def __init__(self, containerBibleObject: Bible, BBB: str) -> None:
    """
    Set up a uW OBS Bible book.

    The containing Bible object and the book code (BBB) are handed straight
    to BibleBook.__init__; afterwards this instance is labelled with the
    uW OBS object name and type strings.
    """
    # The base class has to be initialised before we apply our own labels
    BibleBook.__init__(self, containerBibleObject, BBB)
    self.objectTypeString = 'uW OBS'
    self.objectNameString = 'uW OBS Bible Book object'
def __validateAndExtractBook(self, book, bookNumber):
    """
    Validate one XML book record and extract its chapters.

    The book name is taken from the element's "n" attribute (any other
    attribute is logged as unprocessed).  The name is mapped to a BBB code
    via self.genericBOS, retrying with accents removed if the first lookup
    gives None.  The BBB derived from bookNumber is then compared and, if it
    differs, it replaces the name-derived one.

    If a BBB was determined, a BibleBook is created, every chapter
    subelement is passed to __validateAndExtractChapter (other subelements
    are logged as errors), and the book is stashed.  Returns None.
    """
    vPrint('Verbose', debuggingThisModule, _("Validating XML book…"))

    # Process the div attributes first -- "n" is the only one we understand
    bookName = None
    for attribName, attribValue in book.items():
        if attribName != "n":
            logging.warning(
                "Unprocessed {!r} attribute ({}) in book element".format(
                    attribName, attribValue))
            continue
        bookName = attribValue

    # Try the name as given, then again with any accents removed
    BBB = None
    if bookName:
        BBB = self.genericBOS.getBBBFromText(bookName)
        if BBB is None:
            unaccentedName = BibleOrgSysGlobals.removeAccents(bookName)
            if unaccentedName != bookName:
                BBB = self.genericBOS.getBBBFromText(unaccentedName)

    # Just double check using the book number (which wins any disagreement)
    numberedBBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(
        bookNumber)
    if numberedBBB != BBB:
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            vPrint(
                'Quiet', debuggingThisModule,
                "Assuming that book {} {!r} is {} (not {})".format(
                    bookNumber, bookName, numberedBBB, BBB))
        BBB = numberedBBB

    if not BBB:
        return  # Nothing that we can identify, so nothing to extract

    vPrint('Info', debuggingThisModule,
           _("Validating {} {}…").format(BBB, bookName))
    thisBook = BibleBook(self, BBB)
    thisBook.objectNameString = 'VerseView XML Bible Book object'
    thisBook.objectTypeString = 'VerseView'

    sublocation = "chapter in {}".format(BBB)
    for element in book:
        if element.tag != VerseViewXMLBible.chapterTag:
            logging.error(
                "vb26 Expected to find {!r} but got {!r}".format(
                    VerseViewXMLBible.chapterTag, element.tag))
            continue
        BibleOrgSysGlobals.checkXMLNoText(element, sublocation, 'j3jd')
        BibleOrgSysGlobals.checkXMLNoTail(element, sublocation, 'al1d')
        self.__validateAndExtractChapter(BBB, thisBook, element)

    vPrint('Info', debuggingThisModule,
           " Saving {} into results…".format(BBB))
    self.stashBook(thisBook)
def __init__(self, containerBibleObject: Bible, BBB: str) -> None:
    """
    Set up a USFM2 Bible book.

    Initialises the BibleBook base class, labels this instance as USFM2,
    and (the first time only) fills the module-level sortedNLMarkers with
    the 'Combined' newline markers ordered longest first.
    """
    global sortedNLMarkers

    # The base class has to be initialised before we apply our own labels
    BibleBook.__init__(self, containerBibleObject, BBB)
    self.objectTypeString = 'USFM2'
    self.objectNameString = 'USFM2 Bible Book object'

    if sortedNLMarkers is not None:
        return  # The shared marker list has already been built

    newlineMarkers = USFM2Markers.getNewlineMarkersList('Combined')
    sortedNLMarkers = sorted(newlineMarkers, key=len, reverse=True)
def load(self):
    """
    Load a single Unbound Bible source file and build its book objects.

    Lines starting with '#' are metadata/comment lines: a two-field
    (tab-separated) line with a non-empty value is stored in
    self.suppliedMetadata['Unbound'] and the line is then skipped.
    Every other non-blank line is split on tabs and is expected to have
    4, 6 or 9 fields:
        4: bookCode, chapter, verse, text
        6: bookCode, chapter, verse, subverse, sequence, text
        9: three NRSVA reference fields followed by the six fields above

    A new BibleBook is started whenever the book code changes, a 'c' line
    is added whenever the chapter number changes, and each verse is added
    as a 'v' line.  Each book is passed to self.stashBook(); afterwards the
    'Unbound' metadata is applied and self.doPostLoadProcessing() is called.

    If the file contains no verse lines at all, no book is stashed
    (previously this raised NameError on the final stashBook call).
    """
    vPrint('Info', debuggingThisModule,
           _("Loading {}…").format(self.sourceFilepath))

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}
    self.suppliedMetadata['Unbound'] = {}

    BBB = None
    thisBook = None  # Stays None until the first verse line is seen
    # Pre-set so that the diagnostics below can't hit unbound names
    #   if a bad line comes before the first good one
    bookCode = chapterNumberString = verseNumberString = None
    NRSVA_bookCode = NRSVA_chapterNumberString = NRSVA_verseNumberString = None
    subverseNumberString = sequenceNumberString = None
    lastBookCode = lastChapterNumber = lastVerseNumber = lastSequence = -1
    lastVText = ''
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            if line and line[-1] == '\n':
                line = line[:-1]  # Removing trailing newline character
            if not line:
                continue  # Just discard blank lines

            if line[0] == '#':
                hashBits = line[1:].split('\t')
                if len(hashBits) == 2 and hashBits[1]:  # We have some valid meta-data
                    self.suppliedMetadata['Unbound'][hashBits[0]] = hashBits[1]
                continue  # Just discard comment lines

            bits = line.split('\t')
            if len(bits) == 4:
                bookCode, chapterNumberString, verseNumberString, vText = bits
            elif len(bits) == 6:
                bookCode, chapterNumberString, verseNumberString, \
                    subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 9:
                NRSVA_bookCode, NRSVA_chapterNumberString, NRSVA_verseNumberString, \
                    bookCode, chapterNumberString, verseNumberString, \
                    subverseNumberString, sequenceNumberString, vText = bits
            elif len(bits) == 1 and self.givenName.startswith('lxx_a_parsing_'):
                # The reference shown is that of the previous good line
                logging.warning(
                    _("Skipping bad {!r} line in {} {} {} {}:{}").format(
                        line, self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))
                continue
            else:
                vPrint('Quiet', debuggingThisModule,
                       "Unexpected number of bits", self.givenName, BBB,
                       bookCode, chapterNumberString, verseNumberString,
                       len(bits), bits)
                # NOTE(review): `halt` is a bare name that is not defined in
                #   this function -- presumably a deliberate stop; confirm
                halt

            if NRSVA_bookCode:
                assert len(NRSVA_bookCode) == 3
            if NRSVA_chapterNumberString:
                assert NRSVA_chapterNumberString.isdigit()
            if NRSVA_verseNumberString:
                assert NRSVA_verseNumberString.isdigit()

            if not bookCode and not chapterNumberString and not verseNumberString:
                vPrint(
                    'Quiet', debuggingThisModule,
                    "Skipping empty line in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookCode, chapterNumberString,
                        verseNumberString))
                continue
            if BibleOrgSysGlobals.debugFlag:
                assert len(bookCode) == 3
                assert chapterNumberString.isdigit()
                assert verseNumberString.isdigit()

            if subverseNumberString:
                logging.warning(
                    _("subverseNumberString {!r} in {} {} {}:{}").format(
                        subverseNumberString, BBB, bookCode,
                        chapterNumberString, verseNumberString))

            vText = vText.strip()  # Remove leading and trailing spaces
            if not vText:
                continue  # Just ignore blank verses I think
            if vText == '+':
                continue  # Not sure what this means in basic_english JHN 1:38

            chapterNumber = int(chapterNumberString)
            verseNumber = int(verseNumberString)

            if sequenceNumberString:
                if BibleOrgSysGlobals.debugFlag:
                    assert sequenceNumberString.isdigit()
                sequenceNumber = int(sequenceNumberString)
                if BibleOrgSysGlobals.debugFlag:
                    assert sequenceNumber > lastSequence or \
                        self.givenName in ('gothic_latin', 'hebrew_bhs_consonants',
                                           'hebrew_bhs_vowels', 'latvian_nt',
                                           'ukrainian_1871',)  # Why???
                lastSequence = sequenceNumber

            if bookCode != lastBookCode:  # We've started a new book
                if lastBookCode != -1:  # Better save the last book
                    self.stashBook(thisBook)
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromUnboundBibleCode(
                    bookCode)
                thisBook = BibleBook(self, BBB)
                thisBook.objectNameString = 'Unbound Bible Book object'
                thisBook.objectTypeString = 'Unbound'
                lastBookCode = bookCode
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info(
                        "Have chapter zero in {} {} {} {}:{}".format(
                            self.givenName, BBB, bookCode,
                            chapterNumberString, verseNumberString))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}").format(
                        self.givenName, BBB, bookCode, chapterNumberString,
                        verseNumberString))
                continue
            if BBB == 'PSA' and verseNumberString == '1' and vText.startswith('<') \
                    and self.givenName == 'basic_english':
                # Move Psalm titles to verse zero
                # NOTE(review): only the numeric verseNumber is changed here --
                #   the 'v' line added below still uses verseNumberString
                verseNumber = 0
            # Despite the wording of these warnings, the verse line is still added below
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}"
                      ).format(lastVerseNumber, verseNumber, self.givenName,
                               BBB, bookCode, chapterNumberString,
                               verseNumberString))
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning(
                        _("Ignored duplicated {} verse in {} {} {} {}:{}").format(
                            verseNumber, self.givenName, BBB, bookCode,
                            chapterNumberString, verseNumberString))
                else:
                    logging.warning(
                        _("Ignored duplicated {} verse number in {} {} {} {}:{}"
                          ).format(verseNumber, self.givenName, BBB, bookCode,
                                   chapterNumberString, verseNumberString))
            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (there isn't one if the file had no verse lines)
    if thisBook is not None:
        self.stashBook(thisBook)
    self.applySuppliedMetadata('Unbound')  # Copy some to self.settingsDict
    self.doPostLoadProcessing()
class GreekNT( Bible ):
    """
    Class for handling a Greek NT object (which may contain one or more Bible books)

    Note: BBB is used in this class to represent the three-character referenceAbbreviation.
    """
    def __init__( self, sourceFilepath, givenName=None, encoding='utf-8' ) -> None:
        """
        Constructor: expects the filepath of the source folder.

        Nothing is loaded here.  If sourceFilepath is a folder, the names of
            the readable '.txt' files in it are collected in self.possibleFilenames.
        Otherwise, if the path is not readable, a critical error is logged
            and the constructor returns before self.name is assigned here.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = 'Greek NT Bible object'
        self.objectTypeString = 'GreekNT'

        # Now we can set our object variables
        self.sourceFilepath, self.givenName, self.encoding = sourceFilepath, givenName, encoding

        self.title = self.version = self.date = None
        self.XMLTree = self.header = self.frontMatter = self.divs = self.divTypesString = None
        self.lang = self.language = None

        # Do a preliminary check on the readability of our files
        self.possibleFilenames = []
        if os.path.isdir( self.sourceFilepath ): # We've been given a folder -- see if we can find the files
            for filename in os.listdir( self.sourceFilepath ):
                if filename.lower().endswith('.txt'):
                    thisFilepath = os.path.join( self.sourceFilepath, filename )
                    if os.access( thisFilepath, os.R_OK ): # we can read that file
                        self.possibleFilenames.append( filename )
        elif not os.access( self.sourceFilepath, os.R_OK ):
            logging.critical( "GreekNT: File {!r} is unreadable".format( self.sourceFilepath ) )
            return # No use continuing

        self.name = self.givenName
    # end of __init__


    def loadBooks( self ):
        """
        Load every book listed in Greek.morphgntBookList
            (using the filenames from Greek.morphgntFilenameDict)
            and then run the post-load processing.
        """
        vPrint( 'Info', debuggingThisModule, _("Loading Greek NT from {}…").format( self.sourceFilepath ) )
        for BBB in Greek.morphgntBookList:
            self.loadBook( BBB, Greek.morphgntFilenameDict[BBB] )
        vPrint( 'Verbose', debuggingThisModule, "{} books loaded.".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of loadBooks

    def load( self ):
        """
        Load all the books (just calls loadBooks).
        """
        self.loadBooks()


    def loadBook( self, BBB:str, filename, encoding='utf-8' ):
        """
        Load one book from the given file (inside self.sourceFilepath).

        Each line of the file describes one word.  A 'c' line is added
            whenever the chapter field changes, a 'v' line whenever the verse
            field changes, and then a 'vw' line (the four word forms joined
            with '/') and a 'g' line (POS code and parsing code joined with '/')
            for every word.

        The book is kept in self.thisBook and is stashed if it tests true.
        """
        def dropLeadingZero( digits ):
            """ Remove one leading zero (if there is one). """
            return digits[1:] if digits[0]=='0' else digits

        def unpackLine( line ):
            """
            Split and check one line, returning three tuples:
                (bn,cn,vn), (POSCode,parsingCode), and the four word forms.
            """
            # Should be seven parts in the line
            #   0 book/chapter/verse
            #   1 part of speech (POS)
            #   2 parsing code
            #   3 text (including punctuation)
            #   4 word (with punctuation stripped)
            #   5 normalized word
            #   6 lemma
            # e.g., 180101 N- ----NSM- Παῦλος Παῦλος Παῦλος Παῦλος
            #       180102 N- ----DSF- ⸀ἀδελφῇ ἀδελφῇ ἀδελφῇ ἀδελφή
            #       180102 P- -------- κατ’ κατ’ κατά κατά
            #       180102 N- ----DSF- ἐκκλησίᾳ· ἐκκλησίᾳ ἐκκλησίᾳ ἐκκλησία
            bits = line.split()
            assert len(bits) == 7

            # The first field is three two-digit numbers run together
            bn, cn, vn = ( dropLeadingZero( bits[0][start:start+2] ) for start in (0,2,4) )

            POSCode = bits[1]
            assert len(POSCode) == 2
            assert POSCode in Greek.POSCodes.keys()

            parsingCode = bits[2]
            assert len(parsingCode) == 8
            for j,char in enumerate(parsingCode):
                assert char in Greek.parsingCodes[j]
            positionalCodes = ( Greek.personCodes, Greek.tenseCodes, Greek.voiceCodes, Greek.modeCodes,
                                Greek.caseCodes, Greek.numberCodes, Greek.genderCodes, Greek.degreeCodes )
            for char,validCodes in zip( parsingCode, positionalCodes ):
                assert char in validCodes

            return (bn,cn,vn,), (POSCode,parsingCode,), (bits[3],bits[4],bits[5],bits[6],)
        # end of unpackLine

        self.thisBook = BibleBook( self, BBB )
        self.thisBook.objectNameString = 'Morph Greek NT Bible Book object'
        self.thisBook.objectTypeString = 'MorphGNT'

        filepath = os.path.join( self.sourceFilepath, filename )
        vPrint( 'Info', debuggingThisModule, " Loading {}…".format( filename ) )
        lastC = lastV = None
        with open( filepath, encoding=encoding ) as myFile: # Automatically closes the file when done
            for lineCount,line in enumerate( myFile, start=1 ):
                if lineCount==1 and encoding.lower()=='utf-8' and line and line[0]==chr(65279): #U+FEFF
                    logging.info( "GreekNT: Detected Unicode Byte Order Marker (BOM) in {}".format( filename ) )
                    line = line[1:] # Remove the Unicode Byte Order Marker (BOM)
                if line and line[-1]=='\n': line = line[:-1] # Removing trailing newline character
                # Blank lines are NOT skipped -- they will fail the checks in unpackLine

                (_bn,cn,vn), (POSCode,parsingCode), words = unpackLine( line )
                if cn != lastC:
                    self.thisBook.addLine( 'c', cn )
                    lastC, lastV = cn, None # Forces a 'v' line at the start of each chapter
                if vn != lastV:
                    self.thisBook.addLine( 'v', vn )
                    lastV = vn
                self.thisBook.addLine( 'vw', '/'.join( words ) )
                self.thisBook.addLine( 'g', "{}/{}".format( POSCode, parsingCode ) )

        if self.thisBook:
            vPrint( 'Verbose', debuggingThisModule, " {} words loaded from {}".format( len(self.thisBook), filename ) )
            self.stashBook( self.thisBook )
    # end of loadBook


    @staticmethod
    def _fileReference( index, key, reference, value ):
        """
        Record in the index dict that key was seen with value at reference.

        index[key] is a list of ( [references], value ) tuples with one tuple per
            distinct value.  If value is already filed under key, the reference
            is added to that tuple's list (unless already there); otherwise a
            new ( [reference], value ) tuple is appended.
        """
        entries = index.setdefault( key, [] )
        for refList,existingValue in entries:
            if existingValue == value:
                if reference not in refList: refList.append( reference )
                return
        entries.append( ([reference],value,) )
    # end of _fileReference


    def analyzeWords( self ):
        """
        Go through the NT data and do some filing and sorting of the Greek words.

        Used by the interlinearizer app.

        Sets:
            self.wordCounts: the length of each book by BBB, plus a 'Total' entry
            self.actualWordsToNormalized: actual word -> [ ([references],normalized word), … ]
            self.normalizedWordsToActual: normalized word -> [ ([references],actual word), … ]
            self.normalizedWordsToParsing: normalized word -> [ ([references],parsing), … ]
            self.lemmasToNormalizedWords: lemma -> [ ([references],normalized word), … ]
        """
        vPrint( 'Verbose', debuggingThisModule, "analyzeWords: have {} books in the loaded NT".format( len(self.books) ) )

        self.wordCounts = {} # Wordcount organised by BBB
        self.wordCounts['Total'] = 0
        self.actualWordsToNormalized, self.normalizedWordsToActual, self.normalizedWordsToParsing, self.lemmasToNormalizedWords = {}, {}, {}, {}
        for BBB in self.books:
            wordCount = len(self.books[BBB])
            self.wordCounts[BBB] = wordCount
            self.wordCounts['Total'] += wordCount
            vPrint( 'Verbose', debuggingThisModule, " analyzeWords: {} has {} Greek words".format( BBB, wordCount ) )
            # NOTE(review): this expects iterating a book to give (reference,parsing,words) entries
            #   -- loadBook() stores its data with addLine(), so confirm that this still holds
            for reference,parsing,(punctuatedWord,actualWord,normalizedWord,lemma) in self.books[BBB]:
                self._fileReference( self.actualWordsToNormalized, actualWord, reference, normalizedWord )
                self._fileReference( self.normalizedWordsToActual, normalizedWord, reference, actualWord )
                self._fileReference( self.normalizedWordsToParsing, normalizedWord, reference, parsing )
                self._fileReference( self.lemmasToNormalizedWords, lemma, reference, normalizedWord )

        vPrint( 'Info', debuggingThisModule, "analyzeWords: NT has {} Greek words".format( self.wordCounts['Total'] ) )
        summaries = (
            ( "analyzeWords: NT has {} actual Greek words", self.actualWordsToNormalized ),
            ( "analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToActual ),
            ( "analyzeWords: NT has {} normalized Greek words", self.normalizedWordsToParsing ),
            ( "analyzeWords: NT has {} Greek self.lemmasToNormalizedWords", self.lemmasToNormalizedWords ),
            )
        for template,index in summaries:
            vPrint( 'Info', debuggingThisModule, template.format( len(index) ) )
            if BibleOrgSysGlobals.verbosityLevel > 3: # Display the first seven entries as a sample
                for j,(key,entries) in enumerate( index.items() ):
                    vPrint( 'Quiet', debuggingThisModule, " ", key, entries )
                    if j==6: break
    # end of analyzeWords
def loadBook( self, BBB:str, filename, encoding='utf-8' ):
    """
    Load one book from the given file (inside self.sourceFilepath).

    Each line of the file describes one word.  A 'c' line is added
        whenever the chapter field changes, a 'v' line whenever the verse
        field changes, and then a 'vw' line (the four word forms joined
        with '/') and a 'g' line (POS code and parsing code joined with '/')
        for every word.

    The book is kept in self.thisBook and is stashed if it tests true.
    """
    def dropLeadingZero( digits ):
        """ Remove one leading zero (if there is one). """
        return digits[1:] if digits[0]=='0' else digits

    def parseWordLine( rawLine ):
        """
        Split and check one line, returning three tuples:
            (bn,cn,vn), (POSCode,parsingCode), and the four word forms.
        """
        # Should be seven parts in the line
        #   0 book/chapter/verse
        #   1 part of speech (POS)
        #   2 parsing code
        #   3 text (including punctuation)
        #   4 word (with punctuation stripped)
        #   5 normalized word
        #   6 lemma
        # e.g., 180101 N- ----NSM- Παῦλος Παῦλος Παῦλος Παῦλος
        #       180102 N- ----DSF- ⸀ἀδελφῇ ἀδελφῇ ἀδελφῇ ἀδελφή
        #       180102 P- -------- κατ’ κατ’ κατά κατά
        #       180102 N- ----DSF- ἐκκλησίᾳ· ἐκκλησίᾳ ἐκκλησίᾳ ἐκκλησία
        fields = rawLine.split()
        assert len(fields) == 7

        # The first field is three two-digit numbers run together
        refDigits = fields[0]
        bn, cn, vn = ( dropLeadingZero( refDigits[start:start+2] ) for start in (0,2,4) )

        POSCode = fields[1]
        assert len(POSCode) == 2
        assert POSCode in Greek.POSCodes.keys()

        parsingCode = fields[2]
        assert len(parsingCode) == 8
        for position,code in enumerate( parsingCode ):
            assert code in Greek.parsingCodes[position]
        positionalCodes = ( Greek.personCodes, Greek.tenseCodes, Greek.voiceCodes, Greek.modeCodes,
                            Greek.caseCodes, Greek.numberCodes, Greek.genderCodes, Greek.degreeCodes )
        for code,validCodes in zip( parsingCode, positionalCodes ):
            assert code in validCodes

        return (bn,cn,vn,), (POSCode,parsingCode,), tuple( fields[3:7] )
    # end of parseWordLine

    self.thisBook = BibleBook( self, BBB )
    self.thisBook.objectNameString = 'Morph Greek NT Bible Book object'
    self.thisBook.objectTypeString = 'MorphGNT'

    filepath = os.path.join( self.sourceFilepath, filename )
    vPrint( 'Info', debuggingThisModule, " Loading {}…".format( filename ) )

    currentC = currentV = None
    with open( filepath, encoding=encoding ) as morphFile: # Automatically closes the file when done
        for lineNumber,line in enumerate( morphFile, start=1 ):
            if lineNumber==1 and encoding.lower()=='utf-8' and line.startswith( chr(65279) ): #U+FEFF
                logging.info( "GreekNT: Detected Unicode Byte Order Marker (BOM) in {}".format( filename ) )
                line = line[1:] # Remove the Unicode Byte Order Marker (BOM)
            if line.endswith( '\n' ): line = line[:-1] # Removing trailing newline character
            # Blank lines are NOT skipped -- they will fail the checks in parseWordLine

            (_bn,cn,vn), (POSCode,parsingCode), wordForms = parseWordLine( line )
            if cn != currentC:
                self.thisBook.addLine( 'c', cn )
                currentC, currentV = cn, None # Forces a 'v' line at the start of each chapter
            if vn != currentV:
                self.thisBook.addLine( 'v', vn )
                currentV = vn
            self.thisBook.addLine( 'vw', '/'.join( wordForms ) )
            self.thisBook.addLine( 'g', "{}/{}".format( POSCode, parsingCode ) )

    if self.thisBook:
        vPrint( 'Verbose', debuggingThisModule, " {} words loaded from {}".format( len(self.thisBook), filename ) )
        self.stashBook( self.thisBook )
def loadBook(self, BBB: str):
    """
    Load the requested book out of the SQLite3 database.

    Does nothing if the book is already in self.books, or if a load has
    already been attempted for it (which is logged as a warning).

    Otherwise the book is walked verse by verse -- the chapter and verse
    counts come from self.BibleOrganisationalSystem.getNumVersesList() --
    fetching each verse's Scripture field and passing it to handleRTFLine()
    (missing or unusable verses are logged).  The book is stashed at the end
    only if at least one non-empty text line was found.  The walk is
    abandoned (without stashing) if a non-text line is found in a module
    whose metadata has an 'encryption' entry.
    """
    fnPrint(debuggingThisModule, "loadBook( {} )".format(BBB))
    assert self.preloadDone

    if BBB in self.books:
        dPrint('Quiet', debuggingThisModule,
               " {} is already loaded -- returning".format(BBB))
        return  # Already loaded
    if BBB in self.triedLoadingBook:
        logging.warning(
            "We had already tried loading MySwordBible {} for {}".format(
                BBB, self.name))
        return  # We've already attempted to load this book
    self.triedLoadingBook[BBB] = True
    self.bookNeedsReloading[BBB] = False
    if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag:
        vPrint('Quiet', debuggingThisModule,
               _("MySwordBible: Loading {} from {}…").format(
                   BBB, self.sourceFilepath))

    # Create the book
    thisBook = BibleBook(self, BBB)
    thisBook.objectNameString = 'MySword Bible Book object'
    thisBook.objectTypeString = 'MySword'

    verseList = self.BibleOrganisationalSystem.getNumVersesList(BBB)
    numC, numV = len(verseList), verseList[0]
    nBBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getReferenceNumber(BBB)
    C = V = 1

    ourGlobals = {'haveParagraph': False}
    haveLines = False
    verseQuery = 'select Scripture from Bible where Book=? and Chapter=? and Verse=?'
    while True:
        self.cursor.execute(verseQuery, (nBBB, C, V))
        try:
            line = self.cursor.fetchone()[0]
        except TypeError:  # This reference is missing (no row was returned)
            line = None

        if line is None:
            logging.warning(
                "MySwordBible.load: Have missing verse line at {} {}:{}".format(
                    BBB, C, V))
        elif not isinstance(line, str):
            if 'encryption' in self.suppliedMetadata['MySword']:
                logging.critical(
                    "MySwordBible.load: Unable to decrypt verse line at {} {}:{} {!r}"
                    .format(BBB, C, V, line))
                break
            logging.critical(
                "MySwordBible.load: Unable to decode verse line at {} {}:{} {!r} {}"
                .format(BBB, C, V, line, self.suppliedMetadata['MySword']))
        elif not line:
            logging.warning(
                "MySwordBible.load: Found blank verse line at {} {}:{}".format(
                    BBB, C, V))
        else:
            haveLines = True

            # Some modules end lines with \r\n or have it in the middle!
            #   (We just ignore these for now)
            line = line.rstrip('\r\n')
            if '\r' in line or '\n' in line:  # (in the middle)
                logging.warning(
                    "MySwordBible.load: Found CR or LF characters in verse line at {} {}:{}"
                    .format(BBB, C, V))
                line = line.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

        # Called for every verse (even if the line is missing or unusable)
        handleRTFLine(self.name, BBB, C, V, line, thisBook, ourGlobals)

        # Step on to the next verse (or to the next chapter)
        V += 1
        if V > numV:
            C += 1
            if C > numC:  # That was the final verse -- save this book now
                if haveLines:
                    vPrint('Info', debuggingThisModule, " MySword saving", BBB)
                    self.stashBook(thisBook)
                break
            numV = verseList[C - 1]
            V = 1

        if ourGlobals['haveParagraph']:
            thisBook.addLine('p', '')
            ourGlobals['haveParagraph'] = False
def load(self):
    """
    Load a single Forge for SwordSearcher source file and stash its books.

    The file is read line by line (blank lines are discarded):
      * The first line must match '; TITLE:' -- otherwise it is reported
        and skipped.
      * Lines starting with ';' are header/comment lines.  TITLE,
        ABBREVIATION, HAS ITALICS, HAS FOOTNOTES and HAS REDLETTER are
        copied into the settings; a HAS flag given without a value is
        stored as the string 'True'.  Other ';' lines are logged and
        discarded.
      * '$$ {Name}' starts a metadata block whose following text lines
        are collected under Name.
      * '$$ Book C:V' is a pointer giving the reference for the text on
        the following line.
      * Any other line is verse text: <scripref>, {footnote}, [added] and
        +r/red letter-r/ markup are converted to backslash markers, and a
        leading '¶' becomes a 'p' line.

    Collected settings are saved in self.suppliedMetadata['Forge4SS'] and
    applied, then the post-load processing is run.
    """
    vPrint('Info', debuggingThisModule, _("Loading {}…").format(self.sourceFilepath))

    global BOS66, BOS81, BOSx
    if BOS66 is None:
        BOS66 = BibleOrganisationalSystem('GENERIC-KJV-66-ENG')
    if BOS81 is None:
        BOS81 = BibleOrganisationalSystem('GENERIC-KJV-80-ENG')
    if BOSx is None:
        BOSx = BibleOrganisationalSystem('GENERIC-ENG')

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}

    # (line prefix, settings key) -- a ':' straight after a HAS prefix is optional
    headerFields = (
        ('; TITLE:', 'TITLE'),
        ('; ABBREVIATION:', 'ABBREVIATION'),
        ('; HAS ITALICS', 'HAS_ITALICS'),
        ('; HAS FOOTNOTES', 'HAS_FOOTNOTES'),
        ('; HAS REDLETTER', 'HAS_REDLETTER'),
    )
    # Book codes that are mapped directly rather than guessed
    bookCodeOverrides = {'Ge': 'GEN', 'Le': 'LEV', 'La': 'LAM'}
    # Close up numbered book names so the pointer splits at the right space
    numberedBookFixes = (
        ('1 K', '1K'), ('2 K', '2K'),
        ('1 Chr', '1Chr'), ('2 Chr', '2Chr'),
        ('1 Cor', '1Cor'), ('2 Cor', '2Cor'),
        ('1 Thess', '1Thess'), ('2 Thess', '2Thess'),
        ('1 Tim', '1Tim'), ('2 Tim', '2Tim'),
        ('1 Pet', '1Pet'), ('2 Pet', '2Pet'),
        ('1 J', '1J'), ('2 J', '2J'), ('3 J', '3J'),
    )
    footnotePattern = re.compile(r'\{(.+?)\}')
    addedPattern = re.compile(r'\[(.+?)\]')
    redLetterPattern = re.compile(r'\+r/(.+?)-r/')

    def convertSpans(pattern, text, template):
        """Replace each match (leftmost first) by template filled with its group 1."""
        match = pattern.search(text)
        while match:
            text = text[:match.start()] + template.format(match.group(1)) + text[match.end():]
            match = pattern.search(text)
        return text

    bookCode = BBB = metadataName = None
    metadataContents = ''
    lastBookCode = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    thisBook = None
    settingsDict = {}

    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for lineCount, line in enumerate(myFile, start=1):
            if line.endswith('\n'):
                line = line[:-1]  # Remove the trailing newline character
            if not line:
                continue  # Just discard blank lines

            if lineCount == 1:
                if self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  # U+FEFF
                    logging.info(
                        " ForgeForSwordSearcherBible.load: Detected Unicode Byte Order Marker (BOM)")
                    line = line[1:]  # Remove the Unicode Byte Order Marker (BOM)
                match = re.search('^; TITLE:\\s', line)
                if match:
                    if BibleOrgSysGlobals.debugFlag:
                        vPrint('Quiet', debuggingThisModule,
                               "First line got type {!r} match from {!r}".format(
                                   match.group(0), line))
                else:
                    # (This message used to reference two undefined names and so raised NameError)
                    vPrint('Verbose', debuggingThisModule,
                           "ForgeForSwordSearcherBible.load: (unexpected) first line was {!r} in {}"
                           .format(line, self.sourceFilepath))
                    # NOTE(review): `halt` is not defined in this block -- presumably a
                    #   deliberate debug-mode stop; confirm
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
                    continue

            # Process header stuff
            if line[0] == ';':
                for prefix, key in headerFields:
                    if line.startswith(prefix):
                        value = line[len(prefix):]
                        isFlag = not prefix.endswith(':')
                        if isFlag and value.startswith(':'):
                            value = value[1:]
                        value = value.strip()
                        if not value and isFlag:
                            value = 'True'  # Record that a bare flag line was present
                        if value:
                            settingsDict[key] = value
                        break
                else:  # Not one of the known header fields
                    logging.warning(
                        "ForgeForSwordSearcherBible.load is skipping unknown header/comment line: {}"
                        .format(line))
                continue

            # Process the main segment
            if line.startswith('$$ '):
                # Any pointer line ends the metadata block that was being collected
                if metadataName and metadataContents:
                    settingsDict[metadataName] = metadataContents
                metadataName = None
                pointer = line[3:]
                if pointer and pointer[0] == '{' and pointer[-1] == '}':
                    metadataName = pointer[1:-1]
                    metadataContents = ''
                else:  # Let's assume it's a BCV reference
                    for spacedForm, joinedForm in numberedBookFixes:
                        pointer = pointer.replace(spacedForm, joinedForm)
                    B_CV_Bits = pointer.split(' ', 1)
                    if len(B_CV_Bits) == 2 and ':' in B_CV_Bits[1]:
                        bookCode, CVString = B_CV_Bits
                        chapterNumberString, verseNumberString = CVString.split(':')
                        chapterNumber = int(chapterNumberString)
                        verseNumber = int(verseNumberString)
                        if bookCode != lastBookCode:  # We've started a new book
                            BBB = (bookCodeOverrides.get(bookCode)
                                   or BOS66.getBBBFromText(bookCode)  # Try to guess
                                   or BOS81.getBBBFromText(bookCode)
                                   or BOSx.getBBBFromText(bookCode))
                    else:
                        # Only report names that are certain to be bound here
                        vPrint('Quiet', debuggingThisModule,
                               "Unexpected number of bits", self.givenName, BBB,
                               len(B_CV_Bits), B_CV_Bits)
                continue  # The pointer refers to the text on the next line

            # It's not a $$ line
            if metadataName:
                metadataContents += ('\n' if metadataContents else '') + line
                continue
            if not bookCode:  # No pointer seen yet, so there's no reference for this text
                logging.warning(
                    "ForgeForSwordSearcherBible.load is skipping unknown pre-book line: {}"
                    .format(line))
                continue

            # Handle bits like (<scripref>Pr 2:7</scripref>)
            vText = line.replace('(<scripref>', '\\x - \\xt ').replace('</scripref>)', '\\x*')
            vText = vText.replace('<scripref>', '\\x - \\xt ').replace('</scripref>', '\\x*')
            # Convert {stuff} to footnotes
            vText = convertSpans(
                footnotePattern, vText,
                '\\f + \\fr {}:{} \\ft {{}}\\f*'.format(chapterNumber, verseNumber))
            # Convert [stuff] to added fields
            vText = convertSpans(addedPattern, vText, '\\add {}\\add*')
            # Convert +r/This text is red-letter-r/ to wj fields
            vText = convertSpans(redLetterPattern, vText, '\\wj {}\\wj*')
            # Final check for unexpected remaining formatting
            if any(badChar in vText for badChar in '{}[]/'):
                logging.warning(
                    "Found remaining braces,brackets or slashes in SwordSearcher Forge VPL {} {}:{} {!r}"
                    .format(BBB, chapterNumberString, verseNumberString, vText))

            if bookCode != lastBookCode:  # We've started a new book
                if thisBook is not None:  # Better save the last book (once only)
                    self.stashBook(thisBook)
                    thisBook = None
                if BBB:
                    if BBB in self:
                        logging.critical(
                            "Have duplicated {} book in {}".format(BBB, self.givenName))
                    if BibleOrgSysGlobals.debugFlag:
                        assert BBB not in self
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'ForgeForSwordSearcher Bible Book object'
                    thisBook.objectTypeString = 'ForgeForSwordSearcher'
                    numChapters = len(BOSx.getNumVersesList(BBB))
                    lastBookCode = bookCode
                    lastChapterNumber = lastVerseNumber = -1
                else:
                    logging.critical(
                        "ForgeForSwordSearcherBible could not figure out {!r} book code"
                        .format(bookCode))
                    if BibleOrgSysGlobals.debugFlag: halt
            if not BBB:
                continue  # Can't save text for a book that we couldn't identify

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info(
                        "Have chapter zero in {} {} {} {}:{}".format(
                            self.givenName, BBB, bookCode,
                            chapterNumberString, verseNumberString))
                elif chapterNumber > numChapters:
                    logging.error(
                        "Have high chapter number in {} {} {} {}:{} (expected max of {})"
                        .format(self.givenName, BBB, bookCode,
                                chapterNumberString, verseNumberString, numChapters))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}").format(
                        self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))
                continue  # Exact repeats are the only lines that actually get dropped
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                        lastVerseNumber, verseNumber, self.givenName, BBB,
                        bookCode, chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:  # Same number but different text
                logging.warning(
                    _("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                        verseNumber, self.givenName, BBB, bookCode,
                        chapterNumberString, verseNumberString))

            # Check for paragraph markers
            if vText and vText[0] == '¶':
                thisBook.addLine('p', '')
                vText = vText[1:].lstrip()

            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save a metadata block that ran to the end of the file
    if metadataName and metadataContents:
        settingsDict[metadataName] = metadataContents

    # Save the final book
    if thisBook is not None:
        self.stashBook(thisBook)

    # Clean up
    if settingsDict:
        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        self.suppliedMetadata['Forge4SS'] = settingsDict
        self.applySuppliedMetadata('Forge4SS')  # Copy some to self.settingsDict
    self.doPostLoadProcessing()
def load(self):
    """
    Load all the books out of the SQLite3 database.

    The 'OT' and 'NT' flags in self.suppliedMetadata['MySword'] decide
    where to start and how many books to attempt: 66 from GEN (both), 39
    from GEN (OT only) or 27 from MAT (NT only).  If neither flag is set,
    a critical message is logged and no books are loaded.

    For each book, every chapter:verse reference which the Bible
    organisational system lists is looked up in the Bible table and handed
    on to handleRTFLine.  A book is stashed only if at least one non-blank
    text line was found for it.  Loading stops completely at the first
    line of an encrypted module that is not a string.

    Finally the cursor is closed and deleted, the MySword metadata is
    applied and the post-load processing is run.
    """
    fnPrint(debuggingThisModule, "load()…")
    assert self.preloadDone
    vPrint('Info', debuggingThisModule, _("Loading {}…").format(self.sourceFilepath))

    moduleInfo = self.suppliedMetadata['MySword']
    if moduleInfo['OT'] and moduleInfo['NT']:
        BBB, booksExpected = 'GEN', 66
    elif moduleInfo['OT']:
        BBB, booksExpected = 'GEN', 39
    elif moduleInfo['NT']:
        BBB, booksExpected = 'MAT', 27
    else:
        # Without this branch BBB and booksExpected were left unbound, so the
        #   method failed before the cursor could be released
        logging.critical(
            "MySwordBible.load: Metadata flags neither OT nor NT so no books can be loaded from {}"
            .format(self.sourceFilepath))
        BBB, booksExpected = None, 0

    query = 'select Scripture from Bible where Book=? and Chapter=? and Verse=?'
    ourGlobals = {'haveParagraph': False}  # Shared with handleRTFLine
    decryptionFailed = False
    for bookIndex in range(booksExpected):  # Counts books attempted, not books saved
        if bookIndex > 0:
            BBB = self.BibleOrganisationalSystem.getNextBookCode(BBB)

        # Create the (empty) book and find out which references to ask for
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'MySword Bible Book object'
        thisBook.objectTypeString = 'MySword'
        verseList = self.BibleOrganisationalSystem.getNumVersesList(BBB)
        numC, numV = len(verseList), verseList[0]
        nBBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getReferenceNumber(BBB)
        if ourGlobals['haveParagraph']:
            # A paragraph flagged on the final verse of the previous book
            #   carries over to the start of this one
            thisBook.addLine('p', '')
            ourGlobals['haveParagraph'] = False

        haveLines = False
        C = V = 1
        while C <= numC:
            self.cursor.execute(query, (nBBB, C, V))
            row = self.cursor.fetchone()
            line = None if row is None else row[0]  # No row means this reference is missing

            if line is None:
                logging.warning(
                    "MySwordBible.load: Have missing verse line at {} {}:{}".format(
                        BBB, C, V))
            elif not isinstance(line, str):
                if 'encryption' in moduleInfo:
                    logging.critical(
                        "MySwordBible.load: Unable to decrypt verse line at {} {}:{} {!r}"
                        .format(BBB, C, V, line))
                    decryptionFailed = True
                    break
                logging.critical(
                    "MySwordBible.load: Unable to decode verse line at {} {}:{} {!r} {}"
                    .format(BBB, C, V, line, moduleInfo))
            elif not line:
                logging.warning(
                    "MySwordBible.load: Found blank verse line at {} {}:{}".format(
                        BBB, C, V))
            else:
                haveLines = True
                # Some modules end lines with \r\n or have it in the middle
                line = line.rstrip('\r\n')
                if '\r' in line or '\n' in line:  # (in the middle)
                    logging.warning(
                        "MySwordBible.load: Found CR or LF characters in verse line at {} {}:{}"
                        .format(BBB, C, V))
                    line = line.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

            # Called for every reference, whatever was (or wasn't) fetched
            handleRTFLine(self.name, BBB, C, V, line, thisBook, ourGlobals)

            # Step on to the next verse, rolling over into the next chapter
            V += 1
            if V > numV:
                C += 1
                if C <= numC:
                    numV = verseList[C - 1]
                    V = 1
            if C <= numC and ourGlobals['haveParagraph']:
                thisBook.addLine('p', '')
                ourGlobals['haveParagraph'] = False

        if decryptionFailed:
            break  # Abandon this (unsaved) book and all the following ones
        if haveLines:
            vPrint('Verbose', debuggingThisModule, " MySword saving", BBB, bookIndex + 1)
            self.stashBook(thisBook)

    self.cursor.close()
    del self.cursor
    self.applySuppliedMetadata('MySword')  # Copy some to self.settingsDict
    self.doPostLoadProcessing()
def load(self):
    """
    Load a single verse-per-line (VPL) source file and stash its books.

    The format is identified from the first non-blank line:
      1: 'Book C:V text' (book code, chapter and verse, then the text;
            'C.V' is accepted as well)
      2: 'BBCCCVVV<tab>text' (an eight-digit numeric reference)
      3: as type 2, but preceded by '# name: value' header lines, starting
            with '# language_name:'
    A first line that matches none of these is reported and skipped.
    (Forge for SwordSearcher files are not handled here.)

    Type 1 text has [..] converted to \\add and <..> to \\wj markers, and
    a «..» title on verse 1 of a Psalm becomes a 'd' line following the
    chapter line.  Type 2/3 text has the spaces around punctuation closed
    up.  A leading '¶' becomes a 'p' line.  Lines that can't be parsed
    are reported and skipped.

    Collected type 3 header values are saved in
    self.suppliedMetadata['VPL'] and applied, then the post-load
    processing is run.
    """
    vPrint('Info', debuggingThisModule, _("Loading {}…").format(self.sourceFilepath))

    global BOS66, BOS81, BOSx
    if BOS66 is None:
        BOS66 = BibleOrganisationalSystem('GENERIC-KJV-66-ENG')
    if BOS81 is None:
        BOS81 = BibleOrganisationalSystem('GENERIC-KJV-80-ENG')
    if BOSx is None:
        BOSx = BibleOrganisationalSystem('GENERIC-ENG')

    if self.suppliedMetadata is None:
        self.suppliedMetadata = {}

    # (first-line regex, VPL type) in the order that they're tried
    typePatterns = (
        ('^(\\w{2,5}?)\\s(\\d{1,3})[:\\.](\\d{1,3})\\s', 1),
        ('^(\\d{8})\\s', 2),
        ('^# language_name:\\s', 3),
    )
    # (line prefix, settings key) for the type 3 header lines
    type3Headers = (
        ('# language_name:', 'LanguageName'),
        ('# closest ISO 639-3:', 'ISOLanguageCode'),
        ('# year_short:', 'Year.short'),
        ('# year_long:', 'Year.long'),
        ('# title:', 'WorkTitle'),
        ('# URL:', 'URL'),
        ('# copyright_short:', 'Copyright.short'),
        ('# copyright_long:', 'Copyright.long'),
    )
    # Book numbers above 66 in type 2/3 references
    bnDict = {67: 'TOB', 68: 'JDT', 69: 'ESG', 70: 'WIS', 71: 'SIR', 72: 'BAR',
              73: 'LJE', 74: 'PAZ', 75: 'SUS', 76: 'BEL', 77: 'MA1', 78: 'MA2',
              79: 'MA3', 80: 'MA4', 81: 'ES1', 82: 'ES2', 83: 'MAN', 84: 'PS2',
              85: 'PSS', 86: 'ODE'}
    # Close up the spaces that type 2/3 files put around punctuation
    punctuationFixes = (
        (' ,', ','), (' .', '.'), (' ;', ';'), (' :', ':'), (' !', '!'),
        (' )', ')'), (' ]', ']'), (' ”', '”'), ('“ ', '“'), ('( ', '('), ('[ ', '['),
    )

    vplType = bookCodeText = lastBookCodeText = BBB = lastBBB = None
    lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    thisBook = None
    settingsDict = {}

    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for lineCount, line in enumerate(myFile, start=1):
            if line.endswith('\n'):
                line = line[:-1]  # Remove the trailing newline character
            if not line:
                continue  # Just discard blank lines

            if lineCount == 1:
                if self.encoding.lower() == 'utf-8' and line[0] == chr(65279):  # U+FEFF
                    logging.info(" VPLBible.load: Detected Unicode Byte Order Marker (BOM)")
                    line = line[1:]  # Remove the Unicode Byte Order Marker (BOM)
                # Try to identify the VPL type
                match = None
                for pattern, candidateType in typePatterns:
                    match = re.search(pattern, line)
                    if match:
                        vplType = candidateType
                        break
                if match:
                    if BibleOrgSysGlobals.debugFlag:
                        vPrint('Quiet', debuggingThisModule,
                               "First line got type #{} {!r} match from {!r}".format(
                                   vplType, match.group(0), line))
                else:
                    vPrint('Verbose', debuggingThisModule,
                           "VPLBible.load: (unexpected) first line was {!r} in {}".format(
                               line, self.sourceFilepath))
                    # NOTE(review): `halt` is not defined in this block -- presumably a
                    #   deliberate stop; confirm
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt
                    continue

            # Process header stuff
            if vplType == 3 and line[0] == '#':
                for prefix, key in type3Headers:
                    if line.startswith(prefix):
                        value = line[len(prefix):].strip()
                        if value and value != 'Not available':
                            settingsDict[key] = value
                        break
                else:  # Not one of the known header fields
                    logging.warning(
                        "VPLBible.load {} is skipping unknown line: {}".format(vplType, line))
                continue

            # Process the main segment
            psalmTitle = None
            if vplType == 1:
                bits = line.split(' ', 2)
                # The type detection above accepts C:V and C.V so parse both
                CVBits = re.split('[:.]', bits[1], maxsplit=1) if len(bits) == 3 else []
                if len(CVBits) != 2:
                    # Skip the line rather than carry on with fields left over
                    #   from a previous line (or not yet set at all)
                    vPrint('Quiet', debuggingThisModule, "Unexpected number of bits",
                           self.givenName, BBB, len(bits), bits)
                    continue
                bookCodeText, vText = bits[0], bits[2]
                chapterNumberString, verseNumberString = CVBits
                if chapterNumberString == '':
                    chapterNumberString = '1'  # Handle a bug in some single chapter books in VPL

                if not bookCodeText and not chapterNumberString and not verseNumberString:
                    vPrint('Quiet', debuggingThisModule,
                           "Skipping empty line in {} {} {} {}:{}".format(
                               self.givenName, BBB, bookCodeText,
                               chapterNumberString, verseNumberString))
                    continue
                if BibleOrgSysGlobals.debugFlag: assert 2 <= len(bookCodeText) <= 4
                if BibleOrgSysGlobals.debugFlag: assert chapterNumberString.isdigit()
                if not chapterNumberString.isdigit():
                    logging.error("Invalid chapter number field at {}/{} {!r}:{}".format(
                        bookCodeText, BBB, chapterNumberString, verseNumberString))
                    continue
                if not verseNumberString.isdigit():
                    logging.error("Invalid verse number field at {}/{} {}:{!r}".format(
                        bookCodeText, BBB, chapterNumberString, verseNumberString))
                    if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
                        assert verseNumberString.isdigit()
                    continue
                chapterNumber = int(chapterNumberString)
                verseNumber = int(verseNumberString)

                if bookCodeText != lastBookCodeText:  # We've started a new book
                    lastBBB = BBB
                    # These two short codes are only accepted straight after the preceding book
                    if bookCodeText == 'Le' and lastBBB == 'GEN':
                        BBB = 'LEV'
                    elif bookCodeText in ('Jud',) and lastBBB == 'JOS':
                        BBB = 'JDG'
                    else:
                        BBB = (BOS66.getBBBFromText(bookCodeText)  # Try to guess
                               or BOS81.getBBBFromText(bookCodeText)
                               or BOSx.getBBBFromText(bookCodeText)
                               or BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromText(bookCodeText))
                    if not BBB:
                        logging.critical(
                            "VPL Bible: Unable to determine book code from text {!r} after {!r}={}"
                            .format(bookCodeText, lastBookCodeText, lastBBB))
                        halt

                # Handle special formatting
                #   [square-brackets] are for Italicized words
                #   <angle-brackets> are for the Words of Christ in Red
                #   «chevrons» are for the Titles in the Book of Psalms.
                vText = vText.replace('[', '\\add ').replace(']', '\\add*') \
                             .replace('<', '\\wj ').replace('>', '\\wj*')
                if vText and vText[0] == '«' and BBB == 'PSA' and verseNumberString == '1':
                    titleText, separator, remainder = vText[1:].partition('»')
                    if separator:
                        # Hold the title back until the book and chapter line for this
                        #   Psalm exist -- it was being added before them, i.e. to the
                        #   previous Psalm or even to the previous book
                        psalmTitle = titleText
                        vText = remainder.lstrip()

                # NOTE(review): every '<' was replaced above, so this startswith('<')
                #   test can't succeed -- confirm what was intended
                if BBB == 'PSA' and verseNumberString == '1' and vText.startswith('<') \
                        and self.givenName == 'basic_english':
                    verseNumber = 0  # Move Psalm titles to verse zero

            elif vplType in (2, 3):
                reference, separator, vText = line.partition('\t')
                if not separator:  # The type detection accepts any whitespace here
                    reference, separator, vText = line.partition(' ')
                if len(reference) < 6 or not reference.isdigit():
                    logging.error(
                        "VPLBible.load {} is skipping line with invalid reference: {}".format(
                            vplType, line))
                    continue
                # Convert the zero-padded fields first so that an all-zero field gives 0
                bookCodeText = int(reference[:2])
                chapterNumber = int(reference[2:5])
                verseNumber = int(reference[5:])
                chapterNumberString, verseNumberString = str(chapterNumber), str(verseNumber)
                for spacedForm, closedForm in punctuationFixes:
                    vText = vText.replace(spacedForm, closedForm)
                if bookCodeText != lastBookCodeText:  # We've started a new book
                    lastBBB = BBB
                    if 1 <= bookCodeText <= 66:
                        BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bookCodeText)
                    else:
                        BBB = bnDict.get(bookCodeText)  # None is reported below

            else:
                logging.critical('Unknown VPL type {}'.format(vplType))
                if BibleOrgSysGlobals.debugFlag and debuggingThisModule: halt

            if not bookCodeText:  # No bookCodeText yet
                logging.warning(
                    "VPLBible.load{} is skipping unknown pre-book line: {}".format(vplType, line))
                continue

            if bookCodeText != lastBookCodeText:  # We've started a new book
                if thisBook is not None:  # Better save the last book (once only)
                    self.stashBook(thisBook)
                    thisBook = None
                if BBB:
                    if BBB in self:
                        logging.critical(
                            "Have duplicated {} book in {}".format(BBB, self.givenName))
                    if BibleOrgSysGlobals.debugFlag:
                        assert BBB not in self
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'VPL Bible Book object'
                    thisBook.objectTypeString = 'VPL'
                    numChapters = len(BOSx.getNumVersesList(BBB))
                    lastBookCodeText = bookCodeText
                    lastChapterNumber = lastVerseNumber = -1
                else:
                    logging.critical(
                        "VPLBible{} could not figure out {!r} book code".format(
                            vplType, bookCodeText))
                    if BibleOrgSysGlobals.debugFlag: halt
            if not BBB:
                continue  # Can't save text for a book that we couldn't identify

            if chapterNumber != lastChapterNumber:  # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag:
                    assert chapterNumber > lastChapterNumber or BBB == 'ESG'  # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info("Have chapter zero in {} {} {} {}:{}".format(
                        self.givenName, BBB, bookCodeText,
                        chapterNumberString, verseNumberString))
                elif chapterNumber > numChapters:
                    logging.error(
                        "Have high chapter number in {} {} {} {}:{} (expected max of {})".format(
                            self.givenName, BBB, bookCodeText,
                            chapterNumberString, verseNumberString, numChapters))
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            if psalmTitle is not None:
                thisBook.addLine('d', psalmTitle)  # Psalm title

            # Handle the verse info
            if verseNumber == lastVerseNumber and vText == lastVText:
                logging.warning(
                    _("Ignored duplicate verse line in {} {} {} {}:{}").format(
                        self.givenName, BBB, bookCodeText,
                        chapterNumberString, verseNumberString))
                continue  # Exact repeats are the only lines that actually get dropped
            if verseNumber < lastVerseNumber:
                logging.warning(
                    _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format(
                        lastVerseNumber, verseNumber, self.givenName, BBB,
                        bookCodeText, chapterNumberString, verseNumberString))
            elif verseNumber == lastVerseNumber:  # Same number but different text
                logging.warning(
                    _("Ignored duplicated {} verse number in {} {} {} {}:{}").format(
                        verseNumber, self.givenName, BBB, bookCodeText,
                        chapterNumberString, verseNumberString))

            # Check for paragraph markers
            if vText and vText[0] == '¶':
                thisBook.addLine('p', '')
                vText = vText[1:].lstrip()

            thisBook.addLine('v', verseNumberString + ' ' + vText)
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book
    if thisBook is not None:
        self.stashBook(thisBook)

    # Clean up
    if settingsDict:
        if self.suppliedMetadata is None:
            self.suppliedMetadata = {}
        self.suppliedMetadata['VPL'] = settingsDict
        self.applySuppliedMetadata('VPL')  # Copy some to self.settingsDict
    self.doPostLoadProcessing()
def load( self ):
    """
    Load a single CSV source file and stash each book found in it.

    Blank lines, lines consisting of a single space, and lines starting
    with '#' are skipped. If the first line starts with '"Book",' or
    'Book,' it is treated as a header (and decides whether surrounding
    quotes are stripped from the fields).

    Every other line is split on its first three commas into
        book number, chapter number, verse number, verse text.
    Lines that don't yield four fields are logged and skipped.

    RTF-style escapes in the verse text are converted to characters,
    then 'c' and 'v' lines are added to the current book. Each completed
    book is passed to self.stashBook(), and self.doPostLoadProcessing()
    is called at the end.
    """
    vPrint( 'Info', debuggingThisModule, _("Loading {}…").format( self.sourceFilepath ) )

    def stripMatchingQuotes( field ):
        """ Remove one pair of matching single or double quotes around field (if present). """
        if len(field)>=2 and field[0]==field[-1] and field[0] in '"\'':
            return field[1:-1]
        return field
    # end of stripMatchingQuotes

    lineCount = 0
    BBB = thisBook = None # thisBook stays None if the file has no data lines
    lastBookNumber = lastChapterNumber = lastVerseNumber = -1
    lastVText = ''
    quoted = None # Only set if a recognised header line is found
    with open( self.sourceFilepath, encoding=self.encoding ) as myFile: # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line and line[-1]=='\n': line = line[:-1] # Remove trailing newline character
            if not line: continue # Just discard blank lines
            if line==' ': continue # Handle special case which has blanks on every second line -- HACK
            if line[0]=='#': continue # Just discard comment lines
            if lineCount==1:
                if line.startswith( '"Book",' ):
                    quoted = True
                    continue # Just discard header line
                elif line.startswith( 'Book,' ):
                    quoted = False
                    continue # Just discard header line

            bits = line.split( ',', 3 ) # The verse text (last field) may itself contain commas
            if len(bits) != 4:
                # We can't report book/chapter/verse here because this line didn't supply them
                #   (and the variables may not even exist yet), so report the line number instead
                logging.critical( "Unexpected number of bits {} {} line {}: {} {}".format( self.givenName, BBB, lineCount, len(bits), bits ) )
                continue # Don't reprocess stale values from the previous line
            bString, chapterNumberString, verseNumberString, vText = bits

            # Remove quote marks from these strings
            if quoted:
                bString = stripMatchingQuotes( bString )
                chapterNumberString = stripMatchingQuotes( chapterNumberString )
                verseNumberString = stripMatchingQuotes( verseNumberString )
                vText = stripMatchingQuotes( vText )

            bookNumber = int( bString )
            chapterNumber = int( chapterNumberString )
            verseNumber = int( verseNumberString )

            if bookNumber != lastBookNumber: # We've started a new book
                if lastBookNumber != -1: # Better save the last book
                    self.stashBook( thisBook )
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber( bookNumber ) # Try to guess
                assert BBB
                thisBook = BibleBook( self, BBB )
                thisBook.objectNameString = 'CSV Bible Book object'
                thisBook.objectTypeString = 'CSV'
                lastBookNumber = bookNumber
                lastChapterNumber = lastVerseNumber = -1

            if chapterNumber != lastChapterNumber: # We've started a new chapter
                if BibleOrgSysGlobals.debugFlag: assert chapterNumber > lastChapterNumber or BBB=='ESG' # Esther Greek might be an exception
                if chapterNumber == 0:
                    logging.info( "Have chapter zero in {} {} {} {}:{}".format( self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                thisBook.addLine( 'c', chapterNumberString )
                lastChapterNumber = chapterNumber
                lastVerseNumber = -1

            # Now we have to convert any possible RTF codes to our internal codes
            # First do special characters
            vText = vText.replace( '\\ldblquote', '“' ).replace( '\\rdblquote', '”' ).replace( '\\lquote', '‘' ).replace( '\\rquote', '’' )
            vText = vText.replace( '\\emdash', '—' ).replace( '\\endash', '–' )
            # Now do Unicode characters
            while True: # Find patterns like \\'d3
                match = re.search( r"\\'[0-9a-f][0-9a-f]", vText )
                if not match: break
                i = int( vText[match.start()+2:match.end()], 16 ) # Convert two hex characters to decimal
                vText = vText[:match.start()] + chr( i ) + vText[match.end():]
            while True: # Find patterns like \\u253?
                match = re.search( r"\\u[1-2][0-9][0-9]\?", vText )
                if not match: break
                i = int( vText[match.start()+2:match.end()-1] ) # Convert three digits to decimal
                vText = vText[:match.start()] + chr( i ) + vText[match.end():]

            # Handle the verse info
            if verseNumber==lastVerseNumber and vText==lastVText:
                logging.warning( _("Ignored duplicate verse line in {} {} {} {}:{}").format( self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                continue
            if BBB=='PSA' and verseNumberString=='1' and vText.startswith('<') and self.givenName=='basic_english':
                # Move Psalm titles to verse zero
                # NOTE(review): only the numeric verseNumber changes here -- the 'v' line below
                #   is still written with verseNumberString -- confirm that's intended
                verseNumber = 0
            if verseNumber < lastVerseNumber:
                logging.warning( _("Ignored receding verse number (from {} to {}) in {} {} {} {}:{}").format( lastVerseNumber, verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
            elif verseNumber == lastVerseNumber:
                if vText == lastVText:
                    logging.warning( _("Ignored duplicated {} verse in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
                else:
                    logging.warning( _("Ignored duplicated {} verse number in {} {} {} {}:{}").format( verseNumber, self.givenName, BBB, bookNumber, chapterNumberString, verseNumberString ) )
            thisBook.addLine( 'v', verseNumberString + ' ' + vText )
            lastVText = vText
            lastVerseNumber = verseNumber

    # Save the final book (if we found any data lines at all)
    if thisBook is not None:
        self.stashBook( thisBook )
    self.doPostLoadProcessing()
def __validateAndExtractBook(self, book):
    """
    Validate one XML book element and load its chapters into a new BibleBook.

    The book name is read from the element's "n" attribute (any other
    attribute is logged as unprocessed). The name is looked up first
    via self.genericBOS and then, if that fails, via the module-level
    BibleBooksNames table (which is loaded on first need).

    If a book code is found, 'id', 'h' and 'mt1' lines are added,
        each chapter subelement is handed to __validateAndExtractChapter,
        and the finished book is passed to self.stashBook().
    Otherwise an error is logged and nothing is stashed.
    """
    global BibleBooksNames

    vPrint('Verbose', debuggingThisModule, _("Validating OpenSong XML book…"))

    # Process the div attributes first
    bookName = None
    for attributeName, attributeValue in book.items():
        if attributeName == "n":
            bookName = attributeValue
            continue
        logging.warning(
            "Unprocessed {!r} attribute ({}) in book element".format(
                attributeName, attributeValue))

    if not bookName:  # no bookName
        logging.error(_("OpenSong load can't find a book name"))
        return

    # Booknames are usually in English
    BBB = self.genericBOS.getBBBFromText(bookName)
    if not BBB:  # wasn't English
        if BibleBooksNames is None:
            BibleBooksNames = BibleBooksNamesSystems().loadData()
        # Try non-English booknames
        BBB = BibleBooksNames.getBBBFromText(bookName)

    if not BBB:  # no BBB
        logging.error(
            _("OpenSong load doesn't recognize book name: {!r}").format(bookName))
        return

    vPrint('Info', debuggingThisModule,
           _("Validating {} {}…").format(BBB, bookName))
    newBook = BibleBook(self, BBB)
    newBook.objectNameString = 'OpenSong XML Bible Book object'
    newBook.objectTypeString = 'OpenSong'

    USFMAbbreviation = BibleOrgSysGlobals.loadedBibleBooksCodes.getUSFMAbbreviation(BBB)
    if not USFMAbbreviation:
        logging.critical(f"Unable to find USFM abbreviation for '{BBB}'")
        if BibleOrgSysGlobals.strictCheckingFlag:
            # 'halt' isn't defined in this block -- presumably a deliberate abort -- TODO confirm
            halt
        USFMAbbreviation = 'XXA'  # Fallback abbreviation so the id line can still be written

    idLineText = '{} imported by {}'.format(USFMAbbreviation.upper(), programNameVersion)
    newBook.addLine('id', idLineText)
    newBook.addLine('h', bookName)
    newBook.addLine('mt1', bookName)

    expectedTag = OpenSongXMLBible.chapterTag
    chapterLocation = "chapter in {}".format(BBB)
    for childElement in book:
        if childElement.tag != expectedTag:
            logging.error(
                "Expected to find {!r} but got {!r}".format(
                    OpenSongXMLBible.chapterTag, childElement.tag))
            continue
        BibleOrgSysGlobals.checkXMLNoText(childElement, chapterLocation, 'j3jd')
        BibleOrgSysGlobals.checkXMLNoTail(childElement, chapterLocation, 'al1d')
        self.__validateAndExtractChapter(BBB, newBook, childElement)

    vPrint('Info', debuggingThisModule, " Saving {} into results…".format(BBB))
    self.stashBook(newBook)
def load(self):
    """
    Load a single tab-separated YET source file and stash each book found.

    The file is read in one pass, dispatching on the first field of each
    line ('info', 'book_name', 'verse', 'pericope', 'parallel', 'xref',
    'footnote'). Verses are decoded and held in memory per book, while
    headings, cross-references and footnotes are collected in dictionaries.

    In a second pass, each book is built: section headings (with any
    parallel references) are inserted before their verse, footnote and
    cross-reference callers are expanded inline, and the verse text is
    split on paragraph markers into separate lines. Each book is then
    passed to self.stashBook(), and self.doPostLoadProcessing() is
    called at the end.

    Any other line type is printed and then aborts via 'halt'.
    """
    vPrint('Info', debuggingThisModule,
           _("Loading {}…").format(self.sourceFilepath))

    def decodeVerse(encodedVerseString):
        """
        Decodes the verse which has @ format codes.

        Paragraph-level codes become markers with a double backslash and
        character-level codes become markers with a single backslash
        (so the two kinds can be told apart later).
        """
        verseString = encodedVerseString
        # A leading '@@' simply means that encoding follows (up to two are removed)
        if verseString.startswith('@@'):
            verseString = verseString[2:]
        if verseString.startswith('@@'):
            verseString = verseString[2:]
        # Paragraph markers (marked now with double backslash)
        verseString = verseString.replace('@^', '\\\\p ')
        verseString = verseString.replace('@0', '\\\\m ')
        # NOTE: q4 must use the double backslash like q1..q3
        #   otherwise it's never split out as a paragraph below
        verseString = verseString.replace('@1', '\\\\q1 ').replace(
            '@2', '\\\\q2 ').replace('@3', '\\\\q3 ').replace('@4', '\\\\q4 ')
        verseString = verseString.replace('@8', '\\\\m ')
        # Character markers (marked now with single backslash)
        verseString = verseString.replace('@6', '\\wj ').replace('@5', '\\wj*')
        verseString = verseString.replace('@9', '\\add ').replace('@7', '\\add*')  # or \\i ???
        # Footnote and cross-reference callers (expanded later once the notes are known)
        verseString = re.sub(r'@<f([0-9])@>@/', r'\\ff\1', verseString)
        verseString = re.sub(r'@<x([0-9])@>@/', r'\\xx\1', verseString)
        assert '@' not in verseString
        return verseString
    # end of decodeVerse

    # Read all the lines into bookDict
    lineCount = 0
    bookNameDict, bookDict, footnoteDict, xrefDict, headingDict = {}, {}, {}, {}, {}
    BBB = bookNumberString = chapterNumberString = verseNumberString = encodedVerseString = ''
    lastBBB = None
    bookLines = {}  # Keys are (C,V) strings
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if line and line[-1] == '\n':
                line = line[:-1]  # Remove trailing newline character
            if not line:
                continue  # Just discard blank lines

            bits = line.split('\t')
            lineType = bits[0]
            if lineType == 'info':
                assert len(bits) == 3
                if bits[1] == 'shortName':
                    shortName = bits[2]
                    self.name = shortName
                elif bits[1] == 'longName':
                    longName = bits[2]  # Not currently used
                elif bits[1] == 'description':
                    description = bits[2]  # Not currently used
                elif bits[1] == 'locale':
                    locale = bits[2]  # Not currently used (other than the length check)
                    assert 2 <= len(locale) <= 3
                    if locale == 'in':
                        locale = 'id'  # Fix a quirk in the locale encoding
                else:
                    # The message has five fields so five values must be supplied
                    logging.warning(
                        _("YETBible: unknown {} info field in {} {} {}:{}").format(
                            repr(bits[1]), self.givenName, BBB,
                            chapterNumberString, verseNumberString))
                continue

            elif lineType == 'book_name':
                assert 3 <= len(bits) <= 4
                thisBBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bits[1])
                if len(bits) == 3:
                    bookNameDict[thisBBB] = bits[2], ''
                elif len(bits) == 4:
                    bookNameDict[thisBBB] = bits[2], bits[3]
                continue

            elif lineType == 'verse':
                assert len(bits) == 5
                bookNumberString, chapterNumberString, verseNumberString, encodedVerseString = bits[1:]
                if BibleOrgSysGlobals.debugFlag:
                    assert bookNumberString.isdigit()
                    assert chapterNumberString.isdigit()
                    assert verseNumberString.isdigit()
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                if BBB != lastBBB:  # We have a new book
                    if lastBBB is not None:  # We have a completed book to save
                        bookDict[lastBBB] = bookLines
                    assert BBB in bookNameDict
                    bookLines = {}  # Keys are (C,V) strings
                verseString = decodeVerse(encodedVerseString)
                bookLines[(chapterNumberString, verseNumberString)] = verseString  # Just store it for now
                lastBBB = BBB
                continue

            elif lineType == 'pericope':
                assert len(bits) == 5
                bookNumberString, chapterNumberString, verseNumberString, encodedHeadingString = bits[1:]
                if BibleOrgSysGlobals.debugFlag:
                    assert bookNumberString.isdigit()
                    assert chapterNumberString.isdigit()
                    assert verseNumberString.isdigit()
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                headingString = encodedHeadingString.replace('@9', '\\it ').replace('@7', '\\it*')
                assert '@' not in headingString
                headingDict[(BBB, chapterNumberString, verseNumberString)] = headingString, []  # Blank refList
                continue

            elif lineType == 'parallel':  # These lines optionally follow pericope lines
                assert len(bits) == 2
                # Uses the BBB/C/V left over from the preceding pericope line
                heading, refList = headingDict[(BBB, chapterNumberString, verseNumberString)]
                refList.append(bits[1])
                headingDict[(BBB, chapterNumberString, verseNumberString)] = heading, refList
                continue

            elif lineType == 'xref':
                assert len(bits) == 6
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                if BibleOrgSysGlobals.debugFlag:
                    assert bookNumberString.isdigit()
                    assert chapterNumberString.isdigit()
                    assert verseNumberString.isdigit()
                    assert indexNumberString.isdigit()
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                noteString = encodedNoteString.replace('@9', '\\it ').replace('@7', '\\it*')
                noteString = re.sub(r'@<ta(.+?)@>', r'', noteString)  # Get rid of these encoded BCV references for now
                noteString = re.sub(r'@<to(.+?)@>', r'', noteString)  # Get rid of these OSIS BCV references for now
                noteString = noteString.replace('@/', '')
                assert '@' not in noteString
                xrefDict[(BBB, chapterNumberString, verseNumberString, indexNumberString)] = noteString
                continue

            elif lineType == 'footnote':
                assert len(bits) == 6
                bookNumberString, chapterNumberString, verseNumberString, indexNumberString, encodedNoteString = bits[1:]
                if BibleOrgSysGlobals.debugFlag:
                    assert bookNumberString.isdigit()
                    assert chapterNumberString.isdigit()
                    assert verseNumberString.isdigit()
                    assert indexNumberString.isdigit()
                BBB = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromReferenceNumber(bookNumberString)
                noteString = encodedNoteString.replace('@9', '\\it ').replace('@7', '\\it*')
                assert '@' not in noteString
                footnoteDict[(BBB, chapterNumberString, verseNumberString, indexNumberString)] = noteString
                continue

            else:
                vPrint('Quiet', debuggingThisModule,
                       "YETBible: Unknown line type", self.givenName, BBB,
                       chapterNumberString, verseNumberString, len(bits), bits)
                halt  # Not defined in this block -- presumably a deliberate abort -- TODO confirm

    if lastBBB is not None:  # Save the last book (if the file had any verse lines at all)
        bookDict[lastBBB] = bookLines

    # Now process the books
    for BBB, bkData in bookDict.items():
        thisBook = BibleBook(self, BBB)
        thisBook.objectNameString = 'YET Bible Book object'
        thisBook.objectTypeString = 'YET'
        lastChapterNumberString = None
        for (chapterNumberString, verseNumberString), verseString in bkData.items():
            # Insert headings (can only occur before verses)
            if (BBB, chapterNumberString, verseNumberString) in headingDict:
                heading, refList = headingDict[(BBB, chapterNumberString, verseNumberString)]
                thisBook.addLine('s', heading)
                if refList:
                    thisBook.addLine('r', '(' + '; '.join(refList) + ')')

            # Insert footnotes and cross-references
            #   (the single digit after \ff or \xx is the index of the note for this verse)
            while '\\ff' in verseString:
                fIx = verseString.index('\\ff')
                caller = verseString[fIx + 3]
                assert caller.isdigit()
                note = footnoteDict[(BBB, chapterNumberString, verseNumberString, caller)]
                verseString = verseString[:fIx] + '\\f + \\ft ' + note + '\\f*' + verseString[fIx + 4:]
            while '\\xx' in verseString:
                fIx = verseString.index('\\xx')
                caller = verseString[fIx + 3]
                assert caller.isdigit()
                note = xrefDict[(BBB, chapterNumberString, verseNumberString, caller)]
                verseString = verseString[:fIx] + '\\x - \\xt ' + note + '\\x*' + verseString[fIx + 4:]

            # Save the Bible data fields
            if chapterNumberString != lastChapterNumberString:
                thisBook.addLine('c', chapterNumberString)
                lastChapterNumberString = chapterNumberString
            if verseString.startswith('\\\\'):  # It's an initial paragraph marker
                # Markers are either one character (p, m) or two characters (q1..q4)
                if verseString[3] == ' ':
                    marker, verseString = verseString[2], verseString[4:]
                elif verseString[4] == ' ':
                    marker, verseString = verseString[2:4], verseString[5:]
                else:
                    halt
                thisBook.addLine(marker, '')
            assert not verseString.startswith('\\\\')
            bits = verseString.split('\\\\')  # Split on paragraph markers (but not character markers)
            for j, bit in enumerate(bits):
                if j == 0:
                    # Only the text before the first paragraph marker belongs on the verse line
                    #   (the remaining segments are added as their own paragraph lines below)
                    thisBook.addLine('v', verseNumberString + ' ' + bit.rstrip())
                else:
                    if bit[1] == ' ':
                        marker, bit = bit[0], bit[2:]
                    elif bit[2] == ' ':
                        marker, bit = bit[0:2], bit[3:]
                    else:
                        halt
                    thisBook.addLine(marker, bit.rstrip())
        self.stashBook(thisBook)
    self.doPostLoadProcessing()
def load(self):
    """
    Load a single '|'-separated DrupalBible source file and stash each book found.

    The file is read as three sections, tracked by a status value:
        0 = after the first line: version name details, until '*Chapter'
        1 = book name details, until '*Context'
        2 = verse data (book code, chapter, verse, line mark, text)
    The first line is expected to be '*Bible' (a warning is logged if not).
    Blank lines and lines starting with '#' are skipped.

    'c' and 'v' lines are added to the current book, each completed book
    is passed to self.stashBook(), and self.doPostLoadProcessing() is
    called at the end.
    """
    vPrint('Info', debuggingThisModule,
           _("Loading {}…").format(self.sourceFilepath))

    status = 0  # 1 = getting chapters, 2 = getting verse data
    lineCount = 0
    BBB = lastBBB = None
    thisBook = None  # Stays None if the file has no verse lines
    bookDetails = {}
    with open(self.sourceFilepath, encoding=self.encoding) as myFile:  # Automatically closes the file when done
        for line in myFile:
            lineCount += 1
            if lineCount == 1:
                if line.startswith('\ufeff'):  # U+FEFF (chr(65279))
                    logging.info(
                        "DrupalBible.load1: Detected Unicode Byte Order Marker (BOM) in {}"
                        .format(self.sourceFilepath))
                    line = line[1:]  # Remove the decoded Unicode Byte Order Marker (BOM)
                elif line[:3] == '\xef\xbb\xbf':  # 0xEF,0xBB,0xBF
                    # This is how the three UTF-8 BOM bytes appear if the file
                    #   was opened with an 8-bit encoding (e.g., Latin-1)
                    logging.info(
                        "DrupalBible.load2: Detected Unicode Byte Order Marker (BOM) in {}"
                        .format(self.sourceFilepath))
                    line = line[3:]  # Remove the UTF-8 Unicode Byte Order Marker (BOM)
            if line and line[-1] == '\n':
                line = line[:-1]  # Remove trailing newline character
            if not line:
                continue  # Just discard blank lines
            if line[0] == '#':
                continue  # Just discard comment lines

            if lineCount == 1:
                if line != '*Bible':
                    logging.warning(
                        "Unknown DrupalBible first line: {}".format(repr(line)))

            elif status == 0:
                if line == '*Chapter':
                    status = 1
                else:  # Get the version name details
                    bits = line.split('|')
                    shortName, fullName, language = bits
                    self.name = fullName

            elif status == 1:
                if line == '*Context':
                    status = 2
                else:  # Get the book name details
                    bits = line.split('|')
                    bookCode, bookFullName, bookShortName, numChapters = bits
                    assert bookShortName == bookCode
                    BBBresult = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromDrupalBibleCode(bookCode)
                    # Result can be string or list of strings (best guess first)
                    BBB = BBBresult if isinstance(BBBresult, str) else BBBresult[0]
                    bookDetails[BBB] = bookFullName, bookShortName, numChapters

            elif status == 2:  # Get the verse text
                bits = line.split('|')
                bookCode, chapterNumberString, verseNumberString, lineMark, verseText = bits
                if lineMark:  # We don't know how to handle a non-empty line mark
                    vPrint('Quiet', debuggingThisModule, repr(lineMark))
                    halt  # Not defined in this block -- presumably a deliberate abort -- TODO confirm

                BBBresult = BibleOrgSysGlobals.loadedBibleBooksCodes.getBBBFromDrupalBibleCode(bookCode)
                # Result can be string or list of strings (best guess first)
                BBB = BBBresult if isinstance(BBBresult, str) else BBBresult[0]
                if BBB != lastBBB:  # We've started a new book
                    if lastBBB is not None:  # Better save the last book
                        self.stashBook(thisBook)
                    thisBook = BibleBook(self, BBB)
                    thisBook.objectNameString = 'DrupalBible Bible Book object'
                    thisBook.objectTypeString = 'DrupalBible'
                    lastChapterNumberString = None
                    lastBBB = BBB
                if chapterNumberString != lastChapterNumberString:
                    thisBook.addLine('c', chapterNumberString)
                    lastChapterNumberString = chapterNumberString
                # Angle brackets in the verse text are converted to italics markers
                verseText = verseText.replace('<', '\\it ').replace('>', '\\it*')
                thisBook.addLine('v', verseNumberString + ' ' + verseText)

            else:
                halt

    # Save the final book (if we found any verse lines at all)
    if thisBook is not None:
        self.stashBook(thisBook)
    self.doPostLoadProcessing()