Пример #1
0
    def __init__(self, givenFolderName, givenName=None, encoding='utf-8'):
        """
        Set up a USX XML Bible object for the given folder.

        Remembers the three parameters, chooses a display name, logs an error
        if the folder can't be read, and builds self.possibleFilenameDict
        (BBB -> filename) from the filenames confirmed by USXFilenames.
        """
        # The base class has to be initialised before anything else
        Bible.__init__(self)
        self.objectNameString = "USX XML Bible object"
        self.objectTypeString = "USX"

        # Remember our parameters
        self.givenFolderName = givenFolderName
        self.givenName = givenName
        self.encoding = encoding

        # Pick the first non-empty name candidate:
        #   the given name, the folder's basename, the basename with the
        #   final character (normally a trailing slash) dropped, or a default
        folderPath = self.givenFolderName
        self.name = (self.givenName
                     or os.path.basename(folderPath)
                     or os.path.basename(folderPath[:-1])
                     or "USX Bible")

        # A preliminary readability check (only logs -- doesn't abort)
        if not os.access(folderPath, os.R_OK):
            logging.error(
                "USXXMLBible: File {!r} is unreadable".format(folderPath))

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames(folderPath)
        confirmedPairs = self.USXFilenamesObject.getConfirmedFilenames()
        self.possibleFilenameDict = {
            bookCode: bookFilename
            for bookCode, bookFilename in confirmedPairs
        }
Пример #2
0
    def preload(self):
        """
        Work out which USX book files are available in our folder.

        Logs an error if the folder is unreadable, then fills
        self.possibleFilenameDict (BBB -> filename) and self.availableBBBs
        from the confirmed filenames (or from the possible filenames if
        nothing could be confirmed), and finally sets self.preloadDone.
        """
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            print("USXXMLBible preload() from {}".format(self.sourceFolder))

        # A preliminary readability check (only logs -- doesn't abort)
        folderPath = self.givenFolderName
        if not os.access(folderPath, os.R_OK):
            logging.error(
                "USXXMLBible: File {!r} is unreadable".format(folderPath))

        # Find the filenames of all our books
        filenamesObject = USXFilenames(folderPath)
        self.USXFilenamesObject = filenamesObject
        self.possibleFilenameDict = OrderedDict()
        # Prefer the confirmed filenames -- only ask for the possible ones
        #   if the confirmed list is empty
        bookFiles = (filenamesObject.getConfirmedFilenameTuples()
                     or filenamesObject.getPossibleFilenameTuples())
        for bookCode, bookFilename in bookFiles:
            self.availableBBBs.add(bookCode)
            self.possibleFilenameDict[bookCode] = bookFilename

        self.preloadDone = True
Пример #3
0
    def __init__( self, givenFolderName, givenName=None, encoding='utf-8' ):
        """
        Create the internal USX Bible object.

        Stores the parameters, decides on a name for the Bible, logs an error
            if the folder is unreadable, and fills self.possibleFilenameDict
            (BBB -> filename) from the confirmed USX filenames.
        """
        Bible.__init__( self ) # The base class gets set up first
        self.objectNameString = "USX XML Bible object"
        self.objectTypeString = "USX"

        # Remember our parameters
        self.givenFolderName = givenFolderName
        self.givenName = givenName
        self.encoding = encoding

        # Work down through the name candidates until one is non-empty
        chosenName = self.givenName
        if not chosenName:
            chosenName = os.path.basename( self.givenFolderName )
            if not chosenName: # Try again without the final character (normally a slash)
                chosenName = os.path.basename( self.givenFolderName[:-1] )
                if not chosenName: chosenName = "USX Bible"
        self.name = chosenName

        # A preliminary readability check on our folder (only logs -- doesn't abort)
        folderIsReadable = os.access( self.givenFolderName, os.R_OK )
        if not folderIsReadable:
            logging.error( "USXXMLBible: File {!r} is unreadable".format( self.givenFolderName ) )

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames( self.givenFolderName )
        confirmedPairs = self.USXFilenamesObject.getConfirmedFilenames()
        self.possibleFilenameDict = { bookCode:bookFilename for bookCode,bookFilename in confirmedPairs }
Пример #4
0
    def preload( self ):
        """
        Tries to determine USX filename pattern.

        Logs an error if our folder is unreadable, then builds
            self.possibleFilenameDict (BBB -> filename) from the confirmed
            filenames, falling back to the possible filenames if none could
            be confirmed. Sets self.preloadDone when finished.
        """
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            print( exp("preload() from {}").format( self.sourceFolder ) )

        # Do a preliminary check on the readability of our folder
        #   (only logs the problem -- we still try to continue)
        if not os.access( self.givenFolderName, os.R_OK ):
            logging.error( "USXXMLBible: File {!r} is unreadable".format( self.givenFolderName ) )

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames( self.givenFolderName )
        self.possibleFilenameDict = OrderedDict()
        filenameTuples = self.USXFilenamesObject.getConfirmedFilenameTuples()
        if not filenameTuples: # Try again with the less certain list
            filenameTuples = self.USXFilenamesObject.getPossibleFilenameTuples()
        for BBB,filename in filenameTuples:
            self.possibleFilenameDict[BBB] = filename

        # TODO: Load the Paratext SSF settings into self.suppliedMetadata['PTX']['SSF']
        #   once USXFilenames has a getSSFFilenames() function
        #   (the earlier attempt could never run because it was inside an "if 0:" block
        #       and called that non-existent function, so it has been removed)

        self.preloadDone = True
Пример #5
0
def USXXMLBibleFileCheck(givenFolderName,
                         strictCheck=True,
                         autoLoad=False,
                         autoLoadBooks=False):
    """
    Given a folder, search for USX Bible files or folders in the folder and in the next level down.

    Returns False if an error is found
        (i.e., the given folder is unreadable or is not a folder).

    if autoLoad and autoLoadBooks are both false (default)
        returns None, or the number of Bibles found.

    if autoLoad or autoLoadBooks is true and exactly one USX Bible is found,
        returns the USXXMLBible object after calling preload() on it
        (and also loadBooks() if autoLoadBooks is true).
    If more than one USX Bible is found (in the subfolders),
        the number found is returned instead.

    NOTE: strictCheck is currently only shown in the verbose output --
        the filenames are always confirmed with strictCheck=True.
    """
    if BibleOrgSysGlobals.verbosityLevel > 2:
        print("USXXMLBibleFileCheck( {}, {}, {}, {} )".format(
            givenFolderName, strictCheck, autoLoad, autoLoadBooks))
    if BibleOrgSysGlobals.debugFlag:
        assert givenFolderName and isinstance(givenFolderName, str)
        assert autoLoad in (True, False)

    # Check that the given folder is readable
    if not os.access(givenFolderName, os.R_OK):
        logging.critical(
            _("USXXMLBibleFileCheck: Given {!r} folder is unreadable").format(
                givenFolderName))
        return False
    if not os.path.isdir(givenFolderName):
        logging.critical(
            _("USXXMLBibleFileCheck: Given {!r} path is not a folder").format(
                givenFolderName))
        return False

    # Find all the subfolders in this folder (only needed if we have to look one level down)
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print(" USXXMLBibleFileCheck: Looking for files in given {}".format(
            givenFolderName))
    foundFolders = [
        something for something in os.listdir(givenFolderName)
        if os.path.isdir(os.path.join(givenFolderName, something))
        and something not in BibleOrgSysGlobals.COMMONLY_IGNORED_FOLDERS  # don't visit these directories
    ]

    # See if there's an USXBible project here in this given folder
    UFns = USXFilenames(
        givenFolderName
    )  # Assuming they have standard Paratext style filenames
    if BibleOrgSysGlobals.verbosityLevel > 2: print(UFns)
    filenameTuples = UFns.getConfirmedFilenameTuples(strictCheck=True)
    if BibleOrgSysGlobals.verbosityLevel > 3:
        print("Confirmed:", len(filenameTuples), filenameTuples)
    if BibleOrgSysGlobals.verbosityLevel > 2 and filenameTuples:
        print("  Found {} USX file{}.".format(
            len(filenameTuples), '' if len(filenameTuples) == 1 else 's'))
    if filenameTuples:
        numFound = 1  # The given folder itself counts as one Bible
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("USXXMLBibleFileCheck got", numFound, givenFolderName)
        if autoLoad or autoLoadBooks:
            uB = USXXMLBible(givenFolderName)
            uB.preload()  # Determine the filenames
            if autoLoadBooks: uB.loadBooks()  # Load and process the book files
            return uB
        return numFound

    # Look one level down
    foundProjects = []
    for thisFolderName in sorted(foundFolders):
        tryFolderName = os.path.join(givenFolderName, thisFolderName + '/')
        if not os.access(tryFolderName,
                         os.R_OK):  # The subfolder is not readable
            logging.warning(
                _("USXXMLBibleFileCheck: {!r} subfolder is unreadable").format(
                    tryFolderName))
            continue
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print("    USXXMLBibleFileCheck: Looking for files in {}".format(
                tryFolderName))

        # See if there's an USX Bible with standard Paratext style filenames here in this folder
        UFns = USXFilenames(
            tryFolderName
        )  # Assuming they have standard Paratext style filenames
        if BibleOrgSysGlobals.verbosityLevel > 2: print(UFns)
        filenameTuples = UFns.getConfirmedFilenameTuples(strictCheck=True)
        if BibleOrgSysGlobals.verbosityLevel > 3:
            print("Confirmed:", len(filenameTuples), filenameTuples)
        if BibleOrgSysGlobals.verbosityLevel > 2 and filenameTuples:
            print("  Found {} USX files: {}".format(len(filenameTuples),
                                                    filenameTuples))
        elif BibleOrgSysGlobals.verbosityLevel > 1 and filenameTuples and debuggingThisModule:
            print("  Found {} USX file{}".format(
                len(filenameTuples), '' if len(filenameTuples) == 1 else 's'))
        if filenameTuples:
            foundProjects.append(tryFolderName)

    numFound = len(foundProjects)
    if numFound:
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("USXXMLBibleFileCheck foundProjects", numFound,
                  foundProjects)
        if numFound == 1 and (autoLoad or autoLoadBooks):
            uB = USXXMLBible(foundProjects[0])
            uB.preload()  # Determine the filenames
            if autoLoadBooks: uB.loadBooks()  # Load and process the book files
            return uB
        return numFound

    return None  # No USX Bibles were found
Пример #6
0
class USXXMLBible(Bible):
    """
    Class to load and manipulate USX Bibles.

    The book filenames are determined by preload(),
        and the books themselves are loaded by loadBook() or loadBooks().
    """
    def __init__(self,
                 givenFolderName,
                 givenName=None,
                 givenAbbreviation=None,
                 encoding='utf-8'):
        """
        Create the internal USX Bible object.

        givenFolderName is the folder containing the USX book files
            (also saved as self.sourceFolder).
        givenName is an optional name for the Bible
            (if it's empty, the name is derived from the folder name).
        givenAbbreviation is saved as self.abbreviation.
        encoding is passed on when each book file is loaded.

        No files are accessed here -- see preload().
        """
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = 'USX XML Bible object'
        self.objectTypeString = 'USX'

        self.givenFolderName, self.givenName, self.abbreviation, self.encoding = givenFolderName, givenName, givenAbbreviation, encoding  # Remember our parameters
        self.sourceFolder = self.givenFolderName

        # Now we can set our object variables
        self.name = self.givenName
        if not self.name: self.name = os.path.basename(self.givenFolderName)
        if not self.name:
            self.name = os.path.basename(
                self.givenFolderName[:-1])  # Remove the final slash
        if not self.name: self.name = 'USX Bible'

    # end of USXXMLBible.__init__

    def preload(self):
        """
        Tries to determine USX filename pattern.

        Logs an error if the folder is unreadable, then fills
            self.possibleFilenameDict (BBB -> filename) and self.availableBBBs
            from the confirmed filenames (or from the possible filenames
            if none could be confirmed), and sets self.preloadDone.
        """
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            print("USXXMLBible preload() from {}".format(self.sourceFolder))

        # Do a preliminary check on the readability of our folder
        #   (only logs the problem -- we still try to continue)
        if not os.access(self.givenFolderName, os.R_OK):
            logging.error("USXXMLBible: File {!r} is unreadable".format(
                self.givenFolderName))

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames(self.givenFolderName)
        self.possibleFilenameDict = OrderedDict()
        filenameTuples = self.USXFilenamesObject.getConfirmedFilenameTuples()
        if not filenameTuples:  # Try again with the less certain list
            filenameTuples = self.USXFilenamesObject.getPossibleFilenameTuples()
        for BBB, filename in filenameTuples:
            self.availableBBBs.add(BBB)
            self.possibleFilenameDict[BBB] = filename

        self.preloadDone = True

    # end of USXXMLBible.preload

    def _loadBookObject(self, BBB, filename=None):
        """
        Create, load, and validate the USXXMLBibleBook for BBB and return it
            (without saving it in this Bible object).

        If filename is None, it's looked up in self.possibleFilenameDict
            (so a KeyError is raised if preload() didn't find that book).

        Shared by loadBook() and _loadBookMP().
        """
        if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag:
            print(
                _("  USXXMLBible: Loading {} from {} from {}…").format(
                    BBB, self.name, self.sourceFolder))
        if filename is None: filename = self.possibleFilenameDict[BBB]
        UBB = USXXMLBibleBook(self, BBB)
        UBB.load(filename, self.givenFolderName, self.encoding)
        UBB.validateMarkers()
        return UBB

    # end of USXXMLBible._loadBookObject

    def loadBook(self, BBB, filename=None):
        """
        Load the given book and save it in this Bible object.

        Does nothing if the book is already loaded, or if loading it has
            already been attempted (unless self.bookNeedsReloading[BBB] is set).

        NOTE: You should ensure that preload() has been called first.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print("USXXMLBible.loadBook( {}, {} )".format(BBB, filename))
            assert self.preloadDone

        if BBB not in self.bookNeedsReloading or not self.bookNeedsReloading[
                BBB]:
            if BBB in self.books:
                if BibleOrgSysGlobals.debugFlag:
                    print("  {} is already loaded -- returning".format(BBB))
                return  # Already loaded
            if BBB in self.triedLoadingBook:
                logging.warning(
                    "We had already tried loading USX {} for {}".format(
                        BBB, self.name))
                return  # We've already attempted to load this book
        self.triedLoadingBook[BBB] = True

        UBB = self._loadBookObject(BBB, filename)
        self.stashBook(UBB)
        self.bookNeedsReloading[BBB] = False

    # end of USXXMLBible.loadBook

    def _loadBookMP(self, BBB, filename=None):
        """
        Used for multiprocessing.

        Returns the loaded USXXMLBibleBook object (without saving it),
            or None if the book is already loaded
            or if loading it has already been attempted.

        NOTE: You should ensure that preload() has been called first.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print("USXXMLBible._loadBookMP( {}, {} )".format(BBB, filename))
            assert self.preloadDone

        if BBB in self.books: return  # Already loaded
        if BBB in self.triedLoadingBook:
            logging.warning(
                "We had already tried loading USX {} for {}".format(
                    BBB, self.name))
            return  # We've already attempted to load this book
        self.triedLoadingBook[BBB] = True

        return self._loadBookObject(BBB, filename)

    # end of USXXMLBible._loadBookMP

    def loadBooks(self):
        """
        Load the books.

        Calls preload() first if that hasn't been done yet.
        Returns early (without any post-load processing)
            if there are no files at all in the folder.
        Uses a multiprocessing pool if BibleOrgSysGlobals.maxProcesses > 1
            and we're not already multiprocessing,
            otherwise loads the books one by one.
        """
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print(
                _("USXXMLBible: Loading {} books from {}…").format(
                    self.name, self.givenFolderName))

        if not self.preloadDone: self.preload()

        # Do a preliminary check on the contents of our folder
        foundFiles, foundFolders = [], []
        for something in os.listdir(self.givenFolderName):
            somepath = os.path.join(self.givenFolderName, something)
            if os.path.isdir(somepath): foundFolders.append(something)
            elif os.path.isfile(somepath): foundFiles.append(something)
            else:
                logging.error("Not sure what {!r} is in {}!".format(
                    somepath, self.givenFolderName))
        if foundFolders:
            logging.info(
                "USXXMLBible.loadBooks: Surprised to see subfolders in {!r}: {}"
                .format(self.givenFolderName, foundFolders))
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0:
                print("USXXMLBible.loadBooks: Couldn't find any files in {!r}".
                      format(self.givenFolderName))
            return  # No use continuing

        # Load the books one by one -- assuming that they have regular Paratext style filenames
        if BibleOrgSysGlobals.maxProcesses > 1 \
        and not BibleOrgSysGlobals.alreadyMultiprocessing: # Get our subprocesses ready and waiting for work
            # Load all the books as quickly as possible
            # Use the books that preload() found (the same list as the single-process code below uses)
            #   -- _loadBookMP looks up each filename in self.possibleFilenameDict anyway
            parameters = list(self.possibleFilenameDict)
            if BibleOrgSysGlobals.verbosityLevel > 1:
                print(
                    _("Loading {} {} books using {} processes…").format(
                        len(parameters), 'USX',
                        BibleOrgSysGlobals.maxProcesses))
                print(
                    _("  NOTE: Outputs (including error and warning messages) from loading various books may be interspersed."
                      ))
            BibleOrgSysGlobals.alreadyMultiprocessing = True
            try:
                with multiprocessing.Pool(
                        processes=BibleOrgSysGlobals.maxProcesses
                ) as pool:  # start worker processes
                    results = pool.map(self._loadBookMP,
                                       parameters)  # have the pool do our loads
                for BBB, UBB in zip(parameters, results):
                    if UBB is None:
                        continue  # _loadBookMP declined to load this one (already loaded or tried)
                    self.stashBook(UBB)
                    # Make up our book name dictionaries while we're at it
                    for assumedBookName in UBB.getAssumedBookNames():
                        self.BBBToNameDict[BBB] = assumedBookName
                        assumedBookNameLower = assumedBookName.lower()
                        self.bookNameDict[
                            assumedBookNameLower] = BBB  # Store the deduced book name (just lower case)
                        self.combinedBookNameDict[
                            assumedBookNameLower] = BBB  # Store the deduced book name (just lower case)
                        if ' ' in assumedBookNameLower:
                            self.combinedBookNameDict[
                                assumedBookNameLower.replace(
                                    ' ', ''
                                )] = BBB  # Store the deduced book name (lower case without spaces)
            finally:  # Always reset the flag, even if a load failed
                BibleOrgSysGlobals.alreadyMultiprocessing = False
        else:  # Just single threaded
            for BBB, filename in self.possibleFilenameDict.items():
                self.loadBook(BBB, filename)  # also saves it

        if not self.books:  # Didn't successfully load any regularly named books -- maybe the files have weird names???
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print(
                    "USXXMLBible.loadBooks: Didn't find any regularly named USX files in {!r}"
                    .format(self.givenFolderName))
            # TODO: Try to be intelligent here, e.g., look inside each of the foundFiles
            #   to try to determine which book it contains

        self.doPostLoadProcessing()

    # end of USXXMLBible.loadBooks

    def load(self):
        """
        Load all the books (simply calls loadBooks()).
        """
        self.loadBooks()
Пример #7
0
def USXXMLBibleFileCheck( givenFolderName, strictCheck=True, autoLoad=False, autoLoadBooks=False ):
    """
    Given a folder, search for USX Bible files or folders in the folder and in the next level down.

    Returns False if an error is found
        (i.e., the given folder is unreadable or is not a folder).

    if autoLoad and autoLoadBooks are both false (default)
        returns None, or the number of Bibles found.

    if autoLoad or autoLoadBooks is true and exactly one USX Bible is found,
        returns the USXXMLBible object
        (with load() already called on it if autoLoadBooks is true).
    If more than one USX Bible is found (in the subfolders),
        the number found is returned instead.

    NOTE: strictCheck is currently only shown in the verbose output
        -- it's not used in the actual checking.
    """
    if BibleOrgSysGlobals.verbosityLevel > 2: print( "USXXMLBibleFileCheck( {}, {}, {} )".format( givenFolderName, strictCheck, autoLoad ) )
    if BibleOrgSysGlobals.debugFlag:
        assert givenFolderName and isinstance( givenFolderName, str )
        assert autoLoad in (True,False,)

    # Check that the given folder is readable
    if not os.access( givenFolderName, os.R_OK ):
        logging.critical( _("USXXMLBibleFileCheck: Given {!r} folder is unreadable").format( givenFolderName ) )
        return False
    if not os.path.isdir( givenFolderName ):
        logging.critical( _("USXXMLBibleFileCheck: Given {!r} path is not a folder").format( givenFolderName ) )
        return False

    # Find all the subfolders in this folder (only needed if we have to look one level down)
    if BibleOrgSysGlobals.verbosityLevel > 3: print( " USXXMLBibleFileCheck: Looking for files in given {}".format( givenFolderName ) )
    foundFolders = [ something for something in os.listdir( givenFolderName )
                        if os.path.isdir( os.path.join( givenFolderName, something ) )
                        and something != '__MACOSX' ] # don't visit these directories

    # See if there's an USXBible project here in this given folder
    UFns = USXFilenames( givenFolderName ) # Assuming they have standard Paratext style filenames
    if BibleOrgSysGlobals.verbosityLevel > 2: print( UFns )
    filenameTuples = UFns.getConfirmedFilenames()
    if BibleOrgSysGlobals.verbosityLevel > 3: print( "Confirmed:", len(filenameTuples), filenameTuples )
    if BibleOrgSysGlobals.verbosityLevel > 1 and filenameTuples: print( "  Found {} USX file{}.".format( len(filenameTuples), '' if len(filenameTuples)==1 else 's' ) )
    if filenameTuples:
        numFound = 1 # The given folder itself counts as one Bible
        if BibleOrgSysGlobals.verbosityLevel > 2: print( "USXXMLBibleFileCheck got", numFound, givenFolderName )
        if autoLoad or autoLoadBooks:
            uB = USXXMLBible( givenFolderName )
            if autoLoadBooks: uB.load() # Load and process the file
            return uB
        return numFound

    # Look one level down
    foundProjects = []
    for thisFolderName in sorted( foundFolders ):
        tryFolderName = os.path.join( givenFolderName, thisFolderName+'/' )
        if not os.access( tryFolderName, os.R_OK ): # The subfolder is not readable
            logging.warning( _("USXXMLBibleFileCheck: {!r} subfolder is unreadable").format( tryFolderName ) )
            continue
        if BibleOrgSysGlobals.verbosityLevel > 3: print( "    USXXMLBibleFileCheck: Looking for files in {}".format( tryFolderName ) )

        # See if there's an USX Bible here in this folder
        UFns = USXFilenames( tryFolderName ) # Assuming they have standard Paratext style filenames
        if BibleOrgSysGlobals.verbosityLevel > 2: print( UFns )
        filenameTuples = UFns.getConfirmedFilenames()
        if BibleOrgSysGlobals.verbosityLevel > 3: print( "Confirmed:", len(filenameTuples), filenameTuples )
        if BibleOrgSysGlobals.verbosityLevel > 2 and filenameTuples: print( "  Found {} USX files: {}".format( len(filenameTuples), filenameTuples ) )
        elif BibleOrgSysGlobals.verbosityLevel > 1 and filenameTuples: print( "  Found {} USX file{}".format( len(filenameTuples), '' if len(filenameTuples)==1 else 's' ) )
        if filenameTuples:
            foundProjects.append( tryFolderName )

    numFound = len( foundProjects )
    if numFound:
        if BibleOrgSysGlobals.verbosityLevel > 2: print( "USXXMLBibleFileCheck foundProjects", numFound, foundProjects )
        if numFound == 1 and (autoLoad or autoLoadBooks):
            uB = USXXMLBible( foundProjects[0] )
            if autoLoadBooks: uB.load() # Load and process the file
            return uB
        return numFound

    return None # No USX Bibles were found
Пример #8
0
class USXXMLBible( Bible ):
    """
    Class to load and manipulate USX Bibles.

    The book files are found in the given folder with USXFilenames,
        and each one is loaded into a USXXMLBibleBook object.
    """
    def __init__( self, givenFolderName, givenName=None, encoding='utf-8' ):
        """
        Create the internal USX Bible object.

        givenFolderName is the folder containing the USX book files.
        givenName is an optional name for this Bible -- if it's empty, the name is taken
            from the folder name, or else defaults to "USX Bible".
        encoding is passed on to each book when its file is loaded.
        """
        # Setup and initialise the base class first
        Bible.__init__( self )
        self.objectNameString = "USX XML Bible object"
        self.objectTypeString = "USX"

        self.givenFolderName, self.givenName, self.encoding = givenFolderName, givenName, encoding # Remember our parameters
        self.sourceFolder = self.givenFolderName # loadBook() displays this in its progress message

        # Now we can set our object variables
        #   The second basename attempt drops the final character, i.e., it handles a trailing slash
        self.name = ( self.givenName
                        or os.path.basename( self.givenFolderName )
                        or os.path.basename( self.givenFolderName[:-1] )
                        or "USX Bible" )

        # Do a preliminary check on the readability of our folder
        #   (This is only logged -- we still carry on and look for filenames)
        if not os.access( self.givenFolderName, os.R_OK ):
            logging.error( "USXXMLBible: File {!r} is unreadable".format( self.givenFolderName ) )

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames( self.givenFolderName )
        self.possibleFilenameDict = { BBB:filename for BBB,filename in self.USXFilenamesObject.getConfirmedFilenames() }
    # end of USXXMLBible.__init__


    def loadBook( self, BBB, filename=None ):
        """
        Load a single book (unless it's already loaded, or loading it has already been attempted).

        BBB is the book code.
        filename is the file to load -- if it's None, it's looked up in self.possibleFilenameDict
            (which raises a KeyError if there's no entry for BBB).

        The loaded book is passed to self.saveBook() -- nothing is returned.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( "USXXMLBible.loadBook( {}, {} )".format( BBB, filename ) )
        if BBB in self.books: return # Already loaded
        if BBB in self.triedLoadingBook:
            logging.warning( "We had already tried loading USX {} for {}".format( BBB, self.name ) )
            return # We've already attempted to load this book
        self.triedLoadingBook[BBB] = True # Set this before loading so that a failed load isn't retried
        if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag: print( _("  USXXMLBible: Loading {} from {} from {}...").format( BBB, self.name, self.sourceFolder ) )
        if filename is None: filename = self.possibleFilenameDict[BBB]
        UBB = USXXMLBibleBook( self, BBB )
        UBB.load( filename, self.givenFolderName, self.encoding )
        UBB.validateMarkers()
        self.saveBook( UBB )
    # end of USXXMLBible.loadBook


    @staticmethod
    def _parseSSFField( line ):
        """
        Parse one stripped field line from inside the <ScriptureText> element of an SSF file.

        Handles self-closing fields (like <Name/> or <Name />)
            and single-line fields (like <Name>contents</Name>),
            both optionally with attributes after the field name.

        Returns a 2-tuple (fieldname, value) where value is the contents string,
            or a (contents, attributes) 2-tuple if the opening tag had attributes.
        Returns None if the line isn't a recognisable field.
        """
        if not ( line.startswith( '<' ) and line.endswith( '>' ) ): return None

        if line.endswith( '/>' ): # Handle a self-closing (empty) field
            fieldname = line[1:-3] if line.endswith( ' />' ) else line[1:-2] # Handle it with or without a space
            contents, closingName = '', None # An empty field has no contents and no closing tag to match
        else:
            ix1, ix2 = line.find( '>' ), line.find( '</' )
            if ix1 == -1 or ix2 <= ix1: return None # No closing tag after the opening tag
            fieldname, contents, closingName = line[1:ix1], line[ix1+1:ix2], line[ix2+2:-1]

        attributes = None
        if ' ' in fieldname: # Some fields (like "Naming") may contain attributes
            bits = fieldname.split( None, 1 )
            if len(bits) != 2: return None # Stray whitespace but no actual attributes
            fieldname, attributes = bits

        if closingName is not None and closingName != fieldname: return None # Mismatched closing tag
        return fieldname, ( contents if attributes is None else (contents, attributes) )
    # end of USXXMLBible._parseSSFField


    def _loadSSFData( self, ssfFilepath, encoding='utf-8' ):
        """
        Process the SSF data from the given filepath.

        Lines are only accepted between a <ScriptureText> line and a </ScriptureText> line
            -- anything else is logged as an error.

        Sets self.ssfDict (just the SSF settings) and self.settingsDict (a copy of them).
        Returns the dictionary of SSF settings.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2: print( _("Loading SSF data from {!r}").format( ssfFilepath ) )
        status, settingsDict = 0, {} # status is 0 before <ScriptureText>, 1 inside it, and 2 after </ScriptureText>
        with open( ssfFilepath, encoding=encoding ) as myFile: # Automatically closes the file when done
            for lineNumber,line in enumerate( myFile, start=1 ):
                if lineNumber==1 and line.startswith( chr(65279) ): #U+FEFF
                    logging.info( "USXXMLBible.load: Detected UTF-16 Byte Order Marker in {}".format( ssfFilepath ) )
                    line = line[1:] # Remove the Byte Order Marker
                line = line.strip() # Remove leading and trailing whitespace (including the newline)
                if not line: continue # Just discard blank lines
                processed = False
                if status==0 and line=="<ScriptureText>":
                    status, processed = 1, True
                elif status==1 and line=="</ScriptureText>":
                    status, processed = 2, True
                elif status==1:
                    field = self._parseSSFField( line )
                    if field is not None:
                        settingsDict[field[0]] = field[1]
                        processed = True
                if not processed: logging.error( "Unexpected {!r} line in SSF file".format( line ) )
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print( "  " + _("Got {} SSF entries:").format( len(settingsDict) ) )
            if BibleOrgSysGlobals.verbosityLevel > 3:
                for key in sorted(settingsDict):
                    print( "    {}: {}".format( key, settingsDict[key] ) )
        self.ssfDict = settingsDict # We'll keep a copy of just the SSF settings
        self.settingsDict = settingsDict.copy() # This will be all the combined settings
        return settingsDict
    # end of USXXMLBible._loadSSFData


    def load( self ):
        """
        Load the books.

        First loads every book file that USXFilenames confirmed.
        If that gives no books at all, then every file in the folder is checked
            to see if its first line is an id line, and loaded if so.
        Finally self.doPostLoadProcessing() is called
            (but not if the folder contained no files at all).
        """
        if BibleOrgSysGlobals.verbosityLevel > 1: print( _("USXXMLBible: Loading {} from {}...").format( self.name, self.givenFolderName ) )

        # Do a preliminary check on the contents of our folder
        foundFiles, foundFolders = [], []
        for something in os.listdir( self.givenFolderName ):
            somepath = os.path.join( self.givenFolderName, something )
            if os.path.isdir( somepath ): foundFolders.append( something )
            elif os.path.isfile( somepath ): foundFiles.append( something )
            else: logging.error( "Not sure what {!r} is in {}!".format( somepath, self.givenFolderName ) )
        if foundFolders: logging.info( "USXXMLBible.load: Surprised to see subfolders in {!r}: {}".format( self.givenFolderName, foundFolders ) )
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0: print( "USXXMLBible.load: Couldn't find any files in {!r}".format( self.givenFolderName ) )
            return # No use continuing

        if 0: # We don't have a getSSFFilenames function
            # Attempt to load the metadata file
            ssfFilepathList = self.USXFilenamesObject.getSSFFilenames( searchAbove=True, auto=True )
            if len(ssfFilepathList) == 1: # Seems we found the right one
                self._loadSSFData( ssfFilepathList[0] )

        # Load the books one by one -- assuming that they have regular Paratext style filenames
        # NOTE: A permanently disabled attempt to do this with multiprocessing.Pool.map( self.loadBook, ... )
        #   was removed from here: loadBook() returns None, so its results could never be used as book objects.
        for BBB,filename in self.USXFilenamesObject.getConfirmedFilenames():
            UBB = USXXMLBibleBook( self, BBB )
            UBB.load( filename, self.givenFolderName, self.encoding )
            UBB.validateMarkers()
            self.saveBook( UBB )

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print( "USXXMLBible.load: Didn't find any regularly named USX files in {!r}".format( self.givenFolderName ) )
            for thisFilename in foundFiles:
                # Look for BBB in the ID line (which should be the first line in a USX file)
                # NOTE(review): '\id ' is a USFM-style marker -- confirm that the USX files expected here really start with it
                thisPath = os.path.join( self.givenFolderName, thisFilename )
                try:
                    with open( thisPath, encoding=self.encoding ) as possibleUSXFile: # Automatically closes the file when done
                        firstLine = possibleUSXFile.readline() # We only look at the first line
                except (OSError, UnicodeDecodeError) as err: # Any sort of file might be in the folder, so just skip this one
                    logging.warning( "USXXMLBible.load: Unable to read {!r}: {}".format( thisPath, err ) )
                    continue
                if not firstLine.startswith( '\\id ' ): continue # Doesn't look like one of ours

                USXId = firstLine[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USX ID {!r}".format( USXId ) )
                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )

                UBB = USXXMLBibleBook( self, BBB )
                UBB.load( thisFilename, self.givenFolderName, self.encoding ) # Same parameter order as for the regularly named books above
                UBB.validateMarkers()
                if BibleOrgSysGlobals.debugFlag: print( UBB )
                self.books[BBB] = UBB
                # Make up our book name dictionaries while we're at it
                for assumedBookName in UBB.getAssumedBookNames():
                    self.BBBToNameDict[BBB] = assumedBookName
                    assumedBookNameLower = assumedBookName.lower()
                    self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
                    if ' ' in assumedBookNameLower: self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
            if self.books: print( "USXXMLBible.load: Found {} irregularly named USX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of USXXMLBible.load
Пример #9
0
class USXXMLBible(Bible):
    """
    Class to load and manipulate USX Bibles.

    The book files are found in the given folder with USXFilenames,
    and each one is loaded into a USXXMLBibleBook object.
    """
    def __init__(self, givenFolderName, givenName=None, encoding='utf-8'):
        """
        Create the internal USX Bible object.

        givenFolderName is the folder containing the USX book files.
        givenName is an optional name for this Bible -- if it's empty, the
            name is taken from the folder name, or else defaults to
            "USX Bible".
        encoding is passed on to each book when its file is loaded.
        """
        # Setup and initialise the base class first
        Bible.__init__(self)
        self.objectNameString = "USX XML Bible object"
        self.objectTypeString = "USX"

        # Remember our parameters
        self.givenFolderName = givenFolderName
        self.givenName = givenName
        self.encoding = encoding
        # loadBook() displays this in its progress message
        self.sourceFolder = self.givenFolderName

        # Now we can set our object variables
        self.name = self.givenName
        if not self.name:
            self.name = os.path.basename(self.givenFolderName)
        if not self.name:  # Remove the final slash
            self.name = os.path.basename(self.givenFolderName[:-1])
        if not self.name:
            self.name = "USX Bible"

        # Do a preliminary check on the readability of our folder
        # (This is only logged -- we still carry on and look for filenames)
        if not os.access(self.givenFolderName, os.R_OK):
            logging.error("USXXMLBible: File {!r} is unreadable".format(
                self.givenFolderName))

        # Find the filenames of all our books
        self.USXFilenamesObject = USXFilenames(self.givenFolderName)
        self.possibleFilenameDict = {
            BBB: filename
            for BBB, filename in
            self.USXFilenamesObject.getConfirmedFilenames()
        }

    # end of USXXMLBible.__init__

    def loadBook(self, BBB, filename=None):
        """
        Load a single book (unless it's already loaded, or loading it has
        already been attempted).

        BBB is the book code.
        filename is the file to load -- if it's None, it's looked up in
            self.possibleFilenameDict (which raises a KeyError if there's
            no entry for BBB).

        The loaded book is passed to self.saveBook() -- nothing is returned.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("USXXMLBible.loadBook( {}, {} )".format(BBB, filename))
        if BBB in self.books:
            return  # Already loaded
        if BBB in self.triedLoadingBook:
            logging.warning(
                "We had already tried loading USX {} for {}".format(
                    BBB, self.name))
            return  # We've already attempted to load this book
        # Set this before loading so that a failed load isn't retried
        self.triedLoadingBook[BBB] = True
        if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag:
            print(
                _("  USXXMLBible: Loading {} from {} from {}...").format(
                    BBB, self.name, self.sourceFolder))
        if filename is None:
            filename = self.possibleFilenameDict[BBB]
        UBB = USXXMLBibleBook(self, BBB)
        UBB.load(filename, self.givenFolderName, self.encoding)
        UBB.validateMarkers()
        self.saveBook(UBB)

    # end of USXXMLBible.loadBook

    @staticmethod
    def _parseSSFField(line):
        """
        Parse one stripped field line from inside the <ScriptureText>
        element of an SSF file.

        Handles self-closing fields (like <Name/> or <Name />) and
        single-line fields (like <Name>contents</Name>), both optionally
        with attributes after the field name.

        Returns a 2-tuple (fieldname, value) where value is the contents
        string, or a (contents, attributes) 2-tuple if the opening tag had
        attributes. Returns None if the line isn't a recognisable field.
        """
        if not (line.startswith('<') and line.endswith('>')):
            return None

        if line.endswith('/>'):  # Handle a self-closing (empty) field
            # Handle it with or without a space
            fieldname = line[1:-3] if line.endswith(' />') else line[1:-2]
            # An empty field has no contents and no closing tag to match
            contents, closingName = '', None
        else:
            ix1 = line.find('>')
            ix2 = line.find('</')
            if ix1 == -1 or ix2 <= ix1:
                return None  # No closing tag after the opening tag
            fieldname = line[1:ix1]
            contents = line[ix1 + 1:ix2]
            closingName = line[ix2 + 2:-1]

        attributes = None
        if ' ' in fieldname:
            # Some fields (like "Naming") may contain attributes
            bits = fieldname.split(None, 1)
            if len(bits) != 2:
                return None  # Stray whitespace but no actual attributes
            fieldname, attributes = bits

        if closingName is not None and closingName != fieldname:
            return None  # Mismatched closing tag
        if attributes is None:
            return fieldname, contents
        return fieldname, (contents, attributes)

    # end of USXXMLBible._parseSSFField

    def _loadSSFData(self, ssfFilepath, encoding='utf-8'):
        """
        Process the SSF data from the given filepath.

        Lines are only accepted between a <ScriptureText> line and a
        </ScriptureText> line -- anything else is logged as an error.

        Sets self.ssfDict (just the SSF settings) and self.settingsDict
        (a copy of them). Returns the dictionary of SSF settings.
        """
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print(_("Loading SSF data from {!r}").format(ssfFilepath))
        # status is 0 before <ScriptureText>, 1 inside it,
        #   and 2 after </ScriptureText>
        status, settingsDict = 0, {}
        # The with statement automatically closes the file when done
        with open(ssfFilepath, encoding=encoding) as myFile:
            for lineNumber, line in enumerate(myFile, start=1):
                if lineNumber == 1 and line.startswith(chr(65279)):  #U+FEFF
                    logging.info(
                        "USXXMLBible.load: Detected UTF-16 Byte Order Marker in {}"
                        .format(ssfFilepath))
                    line = line[1:]  # Remove the Byte Order Marker
                # Remove leading and trailing whitespace (incl. the newline)
                line = line.strip()
                if not line:
                    continue  # Just discard blank lines
                processed = False
                if status == 0 and line == "<ScriptureText>":
                    status = 1
                    processed = True
                elif status == 1 and line == "</ScriptureText>":
                    status = 2
                    processed = True
                elif status == 1:
                    field = self._parseSSFField(line)
                    if field is not None:
                        fieldname, value = field
                        settingsDict[fieldname] = value
                        processed = True
                if not processed:
                    logging.error(
                        "Unexpected {!r} line in SSF file".format(line))
        if BibleOrgSysGlobals.verbosityLevel > 2:
            print("  " +
                  _("Got {} SSF entries:").format(len(settingsDict)))
            if BibleOrgSysGlobals.verbosityLevel > 3:
                for key in sorted(settingsDict):
                    print("    {}: {}".format(key, settingsDict[key]))
        # We'll keep a copy of just the SSF settings
        self.ssfDict = settingsDict
        # This will be all the combined settings
        self.settingsDict = settingsDict.copy()
        return settingsDict

    # end of USXXMLBible._loadSSFData

    def load(self):
        """
        Load the books.

        First loads every book file that USXFilenames confirmed. If that
        gives no books at all, then every file in the folder is checked to
        see if its first line is an id line, and loaded if so. Finally
        self.doPostLoadProcessing() is called (but not if the folder
        contained no files at all).
        """
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print(
                _("USXXMLBible: Loading {} from {}...").format(
                    self.name, self.givenFolderName))

        # Do a preliminary check on the contents of our folder
        foundFiles, foundFolders = [], []
        for something in os.listdir(self.givenFolderName):
            somepath = os.path.join(self.givenFolderName, something)
            if os.path.isdir(somepath):
                foundFolders.append(something)
            elif os.path.isfile(somepath):
                foundFiles.append(something)
            else:
                logging.error("Not sure what {!r} is in {}!".format(
                    somepath, self.givenFolderName))
        if foundFolders:
            logging.info(
                "USXXMLBible.load: Surprised to see subfolders in {!r}: {}".
                format(self.givenFolderName, foundFolders))
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0:
                print(
                    "USXXMLBible.load: Couldn't find any files in {!r}".format(
                        self.givenFolderName))
            return  # No use continuing

        if 0:  # We don't have a getSSFFilenames function
            # Attempt to load the metadata file
            ssfFilepathList = self.USXFilenamesObject.getSSFFilenames(
                searchAbove=True, auto=True)
            if len(ssfFilepathList) == 1:  # Seems we found the right one
                self._loadSSFData(ssfFilepathList[0])

        # Load the books one by one -- assuming that they have regular
        #   Paratext style filenames
        # NOTE: A permanently disabled attempt to do this with
        #   multiprocessing.Pool.map( self.loadBook, ... ) was removed from
        #   here: loadBook() returns None, so its results could never be
        #   used as book objects.
        for BBB, filename in self.USXFilenamesObject.getConfirmedFilenames():
            UBB = USXXMLBibleBook(self, BBB)
            UBB.load(filename, self.givenFolderName, self.encoding)
            UBB.validateMarkers()
            self.saveBook(UBB)

        if not self.books:
            # Didn't successfully load any regularly named books -- maybe
            #   the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print(
                    "USXXMLBible.load: Didn't find any regularly named USX files in {!r}"
                    .format(self.givenFolderName))
            for thisFilename in foundFiles:
                # Look for BBB in the ID line (which should be the first
                #   line in a USX file)
                # NOTE(review): '\id ' is a USFM-style marker -- confirm
                #   that the USX files expected here really start with it
                thisPath = os.path.join(self.givenFolderName, thisFilename)
                try:
                    with open(thisPath,
                              encoding=self.encoding) as possibleUSXFile:
                        # We only look at the first line
                        firstLine = possibleUSXFile.readline()
                except (OSError, UnicodeDecodeError) as err:
                    # Any sort of file might be in the folder,
                    #   so just skip this one
                    logging.warning(
                        "USXXMLBible.load: Unable to read {!r}: {}".format(
                            thisPath, err))
                    continue
                if not firstLine.startswith('\\id '):
                    continue  # Doesn't look like one of ours

                # Take the first three non-blank characters after the
                #   space after id
                USXId = firstLine[4:].strip()[:3]
                if BibleOrgSysGlobals.verbosityLevel > 2:
                    print("Have possible USX ID {!r}".format(USXId))
                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM(
                    USXId)
                if BibleOrgSysGlobals.verbosityLevel > 2:
                    print("BBB is {!r}".format(BBB))

                UBB = USXXMLBibleBook(self, BBB)
                # Same parameter order as for the regularly named books above
                UBB.load(thisFilename, self.givenFolderName, self.encoding)
                UBB.validateMarkers()
                if BibleOrgSysGlobals.debugFlag:
                    print(UBB)
                self.books[BBB] = UBB
                # Make up our book name dictionaries while we're at it
                for assumedBookName in UBB.getAssumedBookNames():
                    self.BBBToNameDict[BBB] = assumedBookName
                    # Store the deduced book name (just lower case)
                    assumedBookNameLower = assumedBookName.lower()
                    self.bookNameDict[assumedBookNameLower] = BBB
                    self.combinedBookNameDict[assumedBookNameLower] = BBB
                    if ' ' in assumedBookNameLower:
                        # Store the deduced book name
                        #   (lower case without spaces)
                        self.combinedBookNameDict[
                            assumedBookNameLower.replace(' ', '')] = BBB
            if self.books:
                print("USXXMLBible.load: Found {} irregularly named USX files".
                      format(len(self.books)))
        self.doPostLoadProcessing()

    # end of USXXMLBible.load
Пример #10
0
class USXXMLBible( Bible ):
    """
    Class to load and manipulate USX Bibles.

    """
    def __init__( self, givenFolderName, givenName=None, encoding='utf-8' ):
        """
        Set up the USX Bible object for the given folder (no files are read here).

        givenFolderName: the folder that the USX book files will be looked for in.
        givenName: optional name for this Bible -- if it's empty,
            a name is derived from the folder name instead.
        encoding: remembered and later passed on when the book files are loaded.
        """
        Bible.__init__( self ) # The base class has to be initialised before we add our own fields
        self.objectNameString = 'USX XML Bible object'
        self.objectTypeString = 'USX'

        # Remember our parameters
        self.givenFolderName = givenFolderName
        self.givenName = givenName
        self.encoding = encoding
        self.sourceFolder = givenFolderName

        # Choose our name: the given name if there is one, else the last component of the folder path
        #   (retrying without the final character in case the path ends with a slash),
        #   else a fixed default
        self.name = ( givenName
                        or os.path.basename( givenFolderName )
                        or os.path.basename( givenFolderName[:-1] )
                        or "USX Bible" )

        self.ssfFilepath = None
    # end of USXXMLBible.__init__


    def preload( self ):
        """
        Find out which USX files are in our folder, before any books get loaded.

        Sets self.USXFilenamesObject and self.possibleFilenameDict (BBB -> filename),
            and finally sets self.preloadDone to True.
        An unreadable folder is only logged as an error -- nothing is raised here.
        """
        if BibleOrgSysGlobals.debugFlag or BibleOrgSysGlobals.verbosityLevel > 2:
            print( exp("preload() from {}").format( self.sourceFolder ) )

        # Preliminary readability check on our folder (we log the problem but carry on regardless)
        folderIsReadable = os.access( self.givenFolderName, os.R_OK )
        if not folderIsReadable:
            logging.error( "USXXMLBible: File {!r} is unreadable".format( self.givenFolderName ) )

        # Find the filenames of all our books:
        #   the confirmed filenames are preferred -- the possible ones are only used if there's none confirmed
        self.USXFilenamesObject = USXFilenames( self.givenFolderName )
        self.possibleFilenameDict = OrderedDict()
        filenameTuples = self.USXFilenamesObject.getConfirmedFilenameTuples() \
                            or self.USXFilenamesObject.getPossibleFilenameTuples()
        self.possibleFilenameDict.update( filenameTuples ) # Each entry is a (BBB,filename) pair

        if 0: # Disabled (never executed): we don't have a getSSFFilenames function :(
            if self.suppliedMetadata is None:
                self.suppliedMetadata = {}
            if self.ssfFilepath is None: # it might have been loaded first
                # Attempt to load the SSF file
                ssfFilepathList = self.USXFilenamesObject.getSSFFilenames( searchAbove=True, auto=True )
                if len(ssfFilepathList) > 1:
                    logging.error( exp("preload: Found multiple possible SSF files -- using first one: {}").format( ssfFilepathList ) )
                if len(ssfFilepathList) >= 1: # Seems we found the right one
                    PTXSettingsDict = loadPTX7ProjectData( self, ssfFilepathList[0] )
                    if PTXSettingsDict:
                        if 'PTX' not in self.suppliedMetadata:
                            self.suppliedMetadata['PTX'] = {}
                        self.suppliedMetadata['PTX']['SSF'] = PTXSettingsDict
                        self.applySuppliedMetadata( 'SSF' ) # Copy some to BibleObject.settingsDict

        # NOTE: self.name is not touched here -- it was already chosen in __init__

        self.preloadDone = True
    # end of USXXMLBible.preload


    def loadBook( self, BBB, filename=None ):
        """
        Load the single book BBB from its USX file and stash it in this Bible object.

        If filename is None, the filename recorded for BBB in self.possibleFilenameDict is used.
        Unless the book is flagged in self.bookNeedsReloading, nothing is done
            (and None is returned straight away) if the book is already loaded
            or if we have already tried to load it.

        NOTE: You should ensure that preload() has been called first.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print( "USXXMLBible.loadBook( {}, {} )".format( BBB, filename ) )
            assert self.preloadDone

        flaggedForReload = BBB in self.bookNeedsReloading and self.bookNeedsReloading[BBB]
        if not flaggedForReload:
            if BBB in self.books: # Already loaded
                if BibleOrgSysGlobals.debugFlag:
                    print( "  {} is already loaded -- returning".format( BBB ) )
                return
            if BBB in self.triedLoadingBook: # We've already attempted to load this book
                logging.warning( "We had already tried loading USX {} for {}".format( BBB, self.name ) )
                return
        self.triedLoadingBook[BBB] = True

        if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag:
            print( _("  USXXMLBible: Loading {} from {} from {}…").format( BBB, self.name, self.sourceFolder ) )
        if filename is None:
            filename = self.possibleFilenameDict[BBB]

        bookObject = USXXMLBibleBook( self, BBB )
        bookObject.load( filename, self.givenFolderName, self.encoding )
        bookObject.validateMarkers()
        self.stashBook( bookObject )
        self.bookNeedsReloading[BBB] = False # It's freshly loaded now
    # end of USXXMLBible.loadBook


    def _loadBookMP( self, BBB, filename=None ):
        """
        Variant of loadBook() that is used for multiprocessing:
            the loaded book is returned to the caller rather than being stashed here.

        If filename is None, the filename recorded for BBB in self.possibleFilenameDict is used.

        Returns the loaded USXXMLBibleBook object,
            or None if the book is already loaded or if we have already tried to load it.

        NOTE: You should ensure that preload() has been called first.
        """
        if BibleOrgSysGlobals.debugFlag and debuggingThisModule:
            print( "USXXMLBible._loadBookMP( {}, {} )".format( BBB, filename ) )
            assert self.preloadDone

        if BBB in self.books: # Already loaded
            return None
        if BBB in self.triedLoadingBook: # We've already attempted to load this book
            logging.warning( "We had already tried loading USX {} for {}".format( BBB, self.name ) )
            return None
        self.triedLoadingBook[BBB] = True

        if BibleOrgSysGlobals.verbosityLevel > 2 or BibleOrgSysGlobals.debugFlag:
            print( _("  USXXMLBible: Loading {} from {} from {}…").format( BBB, self.name, self.sourceFolder ) )
        if filename is None:
            filename = self.possibleFilenameDict[BBB]

        bookObject = USXXMLBibleBook( self, BBB )
        bookObject.load( filename, self.givenFolderName, self.encoding )
        bookObject.validateMarkers()
        return bookObject
    # end of USXXMLBible._loadBookMP


    def loadBooks( self ):
        """
        Load all the books that we can find in our folder.

        Calls preload() first if that hasn't been done yet.
        The books listed in self.possibleFilenameDict are then loaded
            (using a pool of worker processes if BibleOrgSysGlobals.maxProcesses > 1,
            otherwise one by one with loadBook()).
        If no books at all got loaded that way, the first line of every file in the folder
            is checked for an id line so that irregularly named files may still be loaded.
        Finishes by calling self.doPostLoadProcessing().

        Returns None.
        NOTE: If the folder contains no files at all, we return early
            WITHOUT calling self.doPostLoadProcessing().
        """
        if BibleOrgSysGlobals.verbosityLevel > 1:
            print( _("USXXMLBible: Loading {} books from {}…").format( self.name, self.givenFolderName ) )

        if not self.preloadDone: self.preload()

        # Do a preliminary check on the contents of our folder
        foundFiles, foundFolders = [], []
        for something in os.listdir( self.givenFolderName ):
            somepath = os.path.join( self.givenFolderName, something )
            if os.path.isdir( somepath ): foundFolders.append( something )
            elif os.path.isfile( somepath ): foundFiles.append( something )
            else: logging.error( "Not sure what {!r} is in {}!".format( somepath, self.givenFolderName ) )
        if foundFolders: logging.info( "USXXMLBible.loadBooks: Surprised to see subfolders in {!r}: {}".format( self.givenFolderName, foundFolders ) )
        if not foundFiles:
            if BibleOrgSysGlobals.verbosityLevel > 0: print( "USXXMLBible.loadBooks: Couldn't find any files in {!r}".format( self.givenFolderName ) )
            return # No use continuing

        # Load the books one by one -- assuming that they have regular Paratext style filenames
        if BibleOrgSysGlobals.maxProcesses > 1: # Load all the books as quickly as possible
            # Use the same book list as the single-threaded branch below:
            #   _loadBookMP looks each filename up in possibleFilenameDict, so these keys are always valid,
            #   and it also covers the case where preload() had to fall back to the "possible" filenames.
            parameters = list( self.possibleFilenameDict )
            if BibleOrgSysGlobals.verbosityLevel > 1:
                print( _("Loading {} {} books using {} CPUs…").format( len(parameters), 'USX', BibleOrgSysGlobals.maxProcesses ) )
                print( _("  NOTE: Outputs (including error and warning messages) from loading various books may be interspersed.") )
            with multiprocessing.Pool( processes=BibleOrgSysGlobals.maxProcesses ) as pool: # start worker processes
                results = pool.map( self._loadBookMP, parameters ) # have the pool do our loads
            for BBB, UBB in zip( parameters, results ): # pool.map() returns the results in parameter order
                if UBB is None: continue # _loadBookMP returns None when it declined to load the book
                self.stashBook( UBB )
                self._storeAssumedBookNames( BBB, UBB ) # Make up our book name dictionaries while we're at it
        else: # Just single threaded
            for BBB,filename in self.possibleFilenameDict.items():
                self.loadBook( BBB, filename ) # also saves it

        if not self.books: # Didn't successfully load any regularly named books -- maybe the files have weird names??? -- try to be intelligent here
            if BibleOrgSysGlobals.verbosityLevel > 2:
                print( "USXXMLBible.loadBooks: Didn't find any regularly named USX files in {!r}".format( self.givenFolderName ) )
            for thisFilename in foundFiles:
                # Look for BBB in the ID line (which should be the first line in the file)
                isUSX = False
                thisPath = os.path.join( self.givenFolderName, thisFilename )
                try:
                    # Sniff the file using the same encoding that the book will be loaded with
                    with open( thisPath, 'rt', encoding=self.encoding ) as possibleUSXFile: # Automatically closes the file when done
                        for line in possibleUSXFile:
                            if line.startswith( '\\id ' ):
                                USXId = line[4:].strip()[:3] # Take the first three non-blank characters after the space after id
                                if BibleOrgSysGlobals.verbosityLevel > 2: print( "Have possible USX ID {!r}".format( USXId ) )
                                BBB = BibleOrgSysGlobals.BibleBooksCodes.getBBBFromUSFM( USXId )
                                if BibleOrgSysGlobals.verbosityLevel > 2: print( "BBB is {!r}".format( BBB ) )
                                isUSX = True
                            break # We only look at the first line
                except UnicodeDecodeError: isUSX = False # Can't be decoded as text, so not one of ours
                if isUSX:
                    UBB = USXXMLBibleBook( self, BBB )
                    # The filename goes first and then the folder -- the same order as in loadBook() and _loadBookMP()
                    UBB.load( thisFilename, self.givenFolderName, self.encoding )
                    UBB.validateMarkers()
                    print( UBB )
                    self.books[BBB] = UBB
                    self._storeAssumedBookNames( BBB, UBB ) # Make up our book name dictionaries while we're at it
            if self.books: print( "USXXMLBible.loadBooks: Found {} irregularly named USX files".format( len(self.books) ) )
        self.doPostLoadProcessing()
    # end of USXXMLBible.loadBooks


    def _storeAssumedBookNames( self, BBB, UBB ):
        """
        Record the book names assumed by the loaded book object UBB
            in our book name dictionaries (private helper for loadBooks).

        Every assumed name is stored (lower-cased) as a key pointing to BBB in
            self.bookNameDict and self.combinedBookNameDict -- the latter also gets
            a key with the spaces removed. self.BBBToNameDict[BBB] ends up holding
            the last of the assumed names.
        """
        for assumedBookName in UBB.getAssumedBookNames():
            self.BBBToNameDict[BBB] = assumedBookName
            assumedBookNameLower = assumedBookName.lower()
            self.bookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
            self.combinedBookNameDict[assumedBookNameLower] = BBB # Store the deduced book name (just lower case)
            if ' ' in assumedBookNameLower:
                self.combinedBookNameDict[assumedBookNameLower.replace(' ','')] = BBB # Store the deduced book name (lower case without spaces)
    # end of USXXMLBible._storeAssumedBookNames

    def load( self ):
        """
        Load this Bible by calling loadBooks().
        """
        self.loadBooks()