def processFilesInDirectory(dirName, anselUnicodeConverter=None, commitNonblocking=0, numThreads=1, deleteAfterIndexing=1):
    """Processes MARC and PMS files in the indexer-queue directory.

    If numThreads > 1 it will try to parallelize the MARC record processing
    (but not PMS indexing -- no reason for that).
    """
    pmsFiles = glob.glob("%s/PMS*.DAT" % dirName)
    updatedAnyRecords = 0
    count = 0
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile(processedFilenameOn, anselUnicodeConverter)
            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s" % processedFilenameOn
            os.remove(processedFilenameOn)
            print "deleting PMS file %s" % fileOn
            os.remove(fileOn)
            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord(bibOn)
            updatedAnyRecords = 1
        else:
            print "no records to index"
            os.remove(fileOn)

    print "[%s] now checking for MARC files" % time.ctime()
    _marcFiles = glob.glob("%s/*.MARC" % dirName)
    _marcFiles += glob.glob("%s/*.marc" % dirName)
    _marcFiles += glob.glob("%s/*.dat" % dirName)
    _marcFiles += glob.glob("%s/*.DAT" % dirName)
    _marcFiles += glob.glob("%s/*scriblio*" % dirName)
    # dedupe _marcFiles here in case a file matches more than one glob;
    # using a dictionary is the fastest way to dedupe a list with Jython
    marcFileDict = {}
    for fileOn in _marcFiles:
        marcFileDict[fileOn] = None
    marcFiles = marcFileDict.keys()
    marcFiles.sort()
    numMarcFiles = len(marcFiles)
    print "[%s] found %d files to process." % (time.ctime(), numMarcFiles)
    if numThreads == 1:
        for fileOn in marcFiles:
            print "processing MARC file %s" % fileOn
            count = indexerDriver.processFile(fileOn, anselUnicodeConverter, nonblocking=1)  # csdebug: added nonblocking here
            updatedAnyRecords = 1
            if deleteAfterIndexing:
                os.remove(fileOn)
    elif numThreads >= numMarcFiles:
        # spin off a thread for each one.
        # was getting weird problems with multithreading (AttributeErrors when
        # trying to iterate over all controlFields in a MARC record) -- trying
        # separate anselUnicodeConverters to see if that's the issue.
        threads = []
        threadrefs = []
        i = 0
        for fileOn in marcFiles:
            convOn = AnselToUnicode()
            jobOn = indexerDriver.processFileJob(fileOn, convOn, nonblocking=1, pid=i)  # csdebug: handle nonblocking option
            _threadOn = Thread(jobOn, "process file job %s" % i)
            threads.append(_threadOn)
            threadrefs.append(jobOn)
            print "starting thread %s processing file %s" % (i, fileOn)
            _threadOn.start()
            i += 1
        updatedAnyRecords = 1
        print "joining threads"
        for i in range(len(threads)):
            threads[i].join()
            # TODO: make sure the thread was successful before nuking.
            if deleteAfterIndexing:
                print "deleting %s" % threadrefs[i].filename
                os.remove(threadrefs[i].filename)
    else:
        # do work queue here (see the _processFilesWithWorkQueue sketch below).
        print "not yet implemented"
    # finally, do a commit here.
    if updatedAnyRecords:
        print "[%s] starting final commit" % time.ctime()
        if commitNonblocking:
            solrConnection.commitNonblocking()
        else:
            solrConnection.commit()
        print "[%s] done committing" % time.ctime()
    return count
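
# The numThreads < numMarcFiles case above is still a stub ("not yet
# implemented"). Below is a minimal, hypothetical sketch of that branch; it is
# not called anywhere yet, and the helper name _processFilesWithWorkQueue is
# invented here. It assumes the same indexerDriver.processFile(filename,
# converter, nonblocking=1) interface used in the single-threaded branch, and
# uses the Python 2 / Jython standard-library Queue and threading modules:
# numThreads worker threads pull filenames off a shared queue, so at most
# numThreads files are processed concurrently.
def _processFilesWithWorkQueue(marcFiles, numThreads, deleteAfterIndexing=1):
    import Queue
    import threading
    workQueue = Queue.Queue()
    for fileOn in marcFiles:
        workQueue.put(fileOn)

    def worker():
        # one converter per worker thread, mirroring the thread-per-file
        # branch above (shared converters appeared not to be thread-safe)
        convOn = AnselToUnicode()
        while 1:
            try:
                fileOn = workQueue.get_nowait()
            except Queue.Empty:
                return
            print "processing MARC file %s" % fileOn
            indexerDriver.processFile(fileOn, convOn, nonblocking=1)
            # TODO: as above, confirm the file indexed successfully before deleting.
            if deleteAfterIndexing:
                os.remove(fileOn)

    workers = []
    for i in range(numThreads):
        threadOn = threading.Thread(target=worker, name="work queue worker %s" % i)
        workers.append(threadOn)
        threadOn.start()
    for threadOn in workers:
        threadOn.join()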
        fOut = open(fOutName, "w")
        fOut.write(data)
        fOut.flush()
        fOut.close()
    else:
        fOutName = None
    return fOutName, deletedBibs


if __name__ == '__main__':
    pmsFiles = glob.glob("%s/PMS*.DAT" % HORIZON_BASE_DIR)
    for fileOn in pmsFiles:
        print "processing PMS file %s" % fileOn
        processedFilenameOn, deletedBibsOn = processPMSFile(fileOn)
        if processedFilenameOn:
            print "processing MARC file %s" % processedFilenameOn
            indexerDriver.processFile(processedFilenameOn)
            # now that we are done processing the file, we delete it.
            print "deleting MARC file %s " % processedFilenameOn
            os.remove(processedFilenameOn)
            print "deleting PMS file %s" % fileOn
            os.remove(fileOn)
            if deletedBibsOn:
                print "processing deleted bibs from MARC file %s" % processedFilenameOn
                for bibOn in deletedBibsOn:
                    print "deleting bib %s" % bibOn
                    indexerDriver.deleteRecord(bibOn)
        else:
            print "no records to index"
            os.remove(fileOn)
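
# Note: the __main__ block above inlines just the PMS half of
# processFilesInDirectory. A hypothetical alternative entry point that reuses
# the function (and would therefore also pick up loose MARC files and run the
# final commit), assuming HORIZON_BASE_DIR is the indexer-queue directory:
#
#     if __name__ == '__main__':
#         processFilesInDirectory(HORIZON_BASE_DIR)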