def __init__(self, marcRecord, anselUnicodeConverter = None, accession = None, config = None, indexerProperties="config/indexes.properties"):
    """Build a Solr-indexable record object from a Marc4J record.

    Converts marcRecord to a dict, then runs every field named in the
    ``active.fields`` config property through either the standard
    MarcExtractor or a named custom processor from the ``processors``
    module, setting each non-empty result as an attribute on self.
    Timing totals are accumulated in _marcRecordToDictTime,
    _extractorCreateTime and _extractionTime for profiling.

    marcRecord            -- the Marc4J record to index
    anselUnicodeConverter -- optional AnselToUnicode; created when None
    accession             -- accepted for interface compatibility; unused here
    config                -- optional pre-loaded properties object; when
                             None, indexerProperties is loaded from disk
    indexerProperties     -- path of the properties file used when no
                             config object is supplied
    """
    start = time.time()
    self._marcRecordToDictTime = 0
    self._extractionTime = 0
    self._extractorCreateTime = 0
    self.bib_num = None
    if anselUnicodeConverter is None:
        anselUnicodeConverter = AnselToUnicode()
    record = marcRecordToDict( marcRecord, anselUnicodeConverter )
    self._marcRecordToDictTime = time.time() - start
    start = time.time()
    if config is None:
        config = loadPropsFile(indexerProperties)
    extractor = MarcExtractor( record )
    self._extractorCreateTime = (time.time() - start)
    self.marc_record = str( record )
    fieldsToDo = [x.strip() for x in config['active.fields'].split(",")]
    _processors = __import__('processors', {},{},[''])
    for fieldOn in fieldsToDo:
        start = time.time()
        processorNameOn = config.get( "%s.processor" % fieldOn, "standardProcessor" )
        marcMapOn = config.get("%s.marcMap" % fieldOn, None)
        # do processing
        if processorNameOn == "standardProcessor":
            # then just use the MARC extractor
            separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ")
            stripTrailingPunctuation = int( config.get("%s.stripTrailingPunctuation" % fieldOn, "0") )
            processedResult = extractor.extract( marcMapOn, separator = separatorOn, stripTrailingPunctuation = stripTrailingPunctuation )
            # Fall back to the configured "last resort" map when the primary
            # map yielded nothing.  (Fixed `== None` to the `is None` idiom.)
            if ((processedResult is None) or len(processedResult) == 0) and config.has_key("%s.marcMap.lastResort" % fieldOn ):
                marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn, None)
                processedResult = extractor.extract( marcMapOn, separator = separatorOn )
        else:
            # custom processor named in the config
            processorOn = getattr( _processors, processorNameOn )
            processedResult = processorOn( record, marcMap=marcMapOn, extractor=extractor )
        # do post-processing based upon type
        typeOn = config.get("%s.type" % fieldOn, "multi")
        if typeOn == "single" and ( type(processedResult) == type([])) and len(processedResult) > 0:
            postProcessedResult = processedResult[0]
        elif typeOn == "singleTranslation":
            # collapse list results to their first element before translating
            if( type(processedResult) == type([]) ):
                if len(processedResult) >= 1:
                    processedResult = processedResult[0]
                else:
                    processedResult = None
            translationMapName = config.get("%s.translationMap" % fieldOn, None)
            if translationMapName is not None:
                _translationMapModule = __import__( "config.codes" , {},{},[''] )
                _translationMap = getattr( _translationMapModule, translationMapName)
                postProcessedResult = _translationMap.get( processedResult, None)
            else:
                # BUGFIX: previously postProcessedResult was left unset (or
                # stale from the previous field) when a singleTranslation
                # field had no translationMap configured.
                postProcessedResult = processedResult
        else:
            postProcessedResult = processedResult
        # set own attribute
        if postProcessedResult is not None and len(postProcessedResult) > 0:
            setattr( self, fieldOn, postProcessedResult )
        self._extractionTime += ( time.time() - start )
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Helios. If not, see <http://www.gnu.org/licenses/>. ## indexes records from solr from java.io import * from java.net import * #from org.marc4j import * #from org.marc4j.converter.impl import * import time, sys, urllib #import csv #from indexer import * from loadPropsFile import * indexerConfig = loadPropsFile("config/indexer.properties") SOLR_COMMIT_MESSAGE = indexerConfig.get("SOLR_COMMIT_MESSAGE") SOLR_OPTIMIZE_MESSAGE = indexerConfig.get("SOLR_OPTIMIZE_MESSAGE") SOLR_DELETE_ID_MESSAGE = indexerConfig.get("SOLR_DELETE_ID_MESSAGE") SOLR_BASE_URL = indexerConfig.get("SOLR_BASE_URL", "http://localhost:8983/solr") SOLR_UPDATE_URL = indexerConfig.get("SOLR_UPDATE_URL") SOLR_QUERY_URL = indexerConfig.get("SOLR_QUERY_URL") MAX_RECORDS_TO_ADD = indexerConfig.get("MAX_RECORDS_TO_ADD") SOLR_INDEX_BATCH_SIZE = indexerConfig.get("SOLR_INDEX_BATCH_SIZE") SOLR_COMMIT_BATCH_SIZE = indexerConfig.get("SOLR_COMMIT_BATCH_SIZE") PRINT_SOLR_POST_DATA = indexerConfig.get("PRINT_SOLR_POST_DATA") PRINT_SOLR_RESP_DATA = indexerConfig.get("PRINT_SOLR_RESP_DATA") PROFILE = indexerConfig.get("PROFILE") DO_ACCESSION = indexerConfig.get("DO_ACCESSION") ## currently not used!
def __init__(self, marcRecord, anselUnicodeConverter = None, accession = None, profile=0, propsObject = None, indexerProperties="config/indexer.properties"): start = time.time() self._marcRecordToDictTime = 0 self._extractionTime = 0 self._extractorCreateTime = 0 self._extractMethodTime = 0 if profile: self._marcRecordToDictProfiling = {} if anselUnicodeConverter is None: print "creating ansel -> unicode converter" # csdebug anselUnicodeConverter = AnselToUnicode() if profile: record, _perfData = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter ) self._marcRecordToDictProfiling = _perfData else: record = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter ) self._marcRecordToDictTime = time.time() - start start = time.time() if not propsObject: config = loadPropsFile(indexerProperties) else: config = propsObject extractor = MarcExtractor( record ) self._extractorCreateTime = (time.time() - start) # TODO: decide if this should be turn-offable or not self.marc_record = str( record ) fieldsToDo = [x.strip() for x in config['active.fields'].split(",")] _processors = __import__('processors', {},{},['']) for fieldOn in fieldsToDo: start = time.time() processorNameOn = config.get( "%s.processor" % fieldOn, "standardProcessor" ) marcMapOn = config.get("%s.marcMap" % fieldOn, None) # do processing if processorNameOn == "standardProcessor": # then just use the MARC extractor separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ") stripTrailingCommas = int( config.get("%s.stripTrailingCommas" % fieldOn, "0") ) if stripTrailingCommas: extractMethodStart = time.time() processedResult = extractor.extract( marcMapOn, separator = separatorOn, trailingPunctuationToStrip = [","], stripTrailingPunctuation = 1 ) self._extractMethodTime += ( time.time() - extractMethodStart ) else: stripTrailingPunctuation = int( config.get("%s.stripTrailingPunctuation" % fieldOn, "0") ) try: extractMethodStart = time.time() processedResult = 
extractor.extract( marcMapOn, separator = separatorOn, stripTrailingPunctuation = stripTrailingPunctuation ) self._extractMethodTime += ( time.time() - extractMethodStart ) except AttributeError: print "You do not have a correct marc mapping set up for field %s" % fieldOn if ((processedResult == None) or len(processedResult) == 0) and config.has_key("%s.marcMap.lastResort" % fieldOn ): marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn, None) extractMethodStart = time.time() processedResult = extractor.extract( marcMapOn, separator = separatorOn ) self._extractMethodTime += ( time.time() - extractMethodStart ) else: # get and run custom processor processorOn = getattr( _processors, processorNameOn ) processedResult = processorOn( record, marcMap=marcMapOn, extractor=extractor ) # do post-processing based upon type typeOn = config.get("%s.type" % fieldOn, "multi") if typeOn == "single" and ( type(processedResult) == type([])) and len(processedResult) > 0: postProcessedResult = processedResult[0] elif typeOn == "singleTranslation": if( type(processedResult) == type([]) ): if len(processedResult) >= 1: processedResult = processedResult[0] else: processedResult = None translationMapName = config.get("%s.translationMap" % fieldOn, None) if translationMapName is not None: _translationMapModule = __import__( "config.codes" , {},{},[''] ) _translationMap = getattr( _translationMapModule, translationMapName) postProcessedResult = _translationMap.get( processedResult, None) else: postProcessedResult = processedResult # deal with stripWhitespace after all other text manipulations stripWhitespace = int( config.get("%s.stripWhitespace" % fieldOn, "0") ) if stripWhitespace: if type( postProcessedResult ) == type(""): postProcessedResult = postProcessedResult.strip() elif type( postProcessedResult) == type([]): postProcessedResult = [x.strip() for x in postProcessedResult] # FINALLY, set own attribute if postProcessedResult is not None and len(postProcessedResult) > 0: 
setattr( self, fieldOn, postProcessedResult ) self._extractionTime += ( time.time() - start )
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Helios. If not, see <http://www.gnu.org/licenses/>. ## indexes records from solr from java.io import * from java.net import * from org.marc4j import * from org.marc4j.converter.impl import * import time, sys, urllib from indexer import * from loadPropsFile import * indexerConfig = loadPropsFile("config/indexer.properties") SOLR_COMMIT_MESSAGE = indexerConfig.get( "SOLR_COMMIT_MESSAGE" ) SOLR_OPTIMIZE_MESSAGE = indexerConfig.get( "SOLR_OPTIMIZE_MESSAGE" ) SOLR_DELETE_ID_MESSAGE = indexerConfig.get("SOLR_DELETE_ID_MESSAGE" ) SOLR_BASE_URL = indexerConfig.get("SOLR_BASE_URL", "http://localhost:8888/solr" ) SOLR_UPDATE_URL = indexerConfig.get( "SOLR_UPDATE_URL" ) SOLR_QUERY_URL = indexerConfig.get( "SOLR_QUERY_URL" ) MAX_RECORDS_TO_ADD = indexerConfig.get("MAX_RECORDS_TO_ADD") RECORDS_TO_SKIP = indexerConfig.get("RECORDS_TO_SKIP") SOLR_INDEX_BATCH_SIZE = indexerConfig.get("SOLR_INDEX_BATCH_SIZE") SOLR_COMMIT_BATCH_SIZE = indexerConfig.get("SOLR_COMMIT_BATCH_SIZE") PRINT_SOLR_POST_DATA = indexerConfig.get("PRINT_SOLR_POST_DATA") PRINT_SOLR_RESP_DATA = indexerConfig.get("PRINT_SOLR_RESP_DATA") PROFILE = indexerConfig.get("PROFILE") DO_ACCESSION = indexerConfig.get("DO_ACCESSION") ## currently not used!
def processFile( filename, anselUnicodeConverter = None ):
    """Index every MARC record in ``filename`` into Solr.

    Reads records with marc4j's MarcStreamReader, converts each to a
    serialized Solr document via recordForSolr, POSTs batches of
    SOLR_INDEX_BATCH_SIZE documents and commits every
    SOLR_COMMIT_BATCH_SIZE records, printing throughput (and, when
    PROFILE is set, per-phase timing) after each commit.
    """
    inStream = FileInputStream(filename)
    print "processing file <<%s>>" % filename
    marcReader = MarcStreamReader( inStream )
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    m4j = None
    # per-phase timing accumulators, reported when PROFILE is set
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    indexesConfig = loadPropsFile("config/indexes.properties")
    accession = 0 # TODO: try and load pickled accession
                  # from somewhere
    serializedRecord = None
    while marcReader.hasNext() and count < MAX_RECORDS_TO_ADD:
        print ".",
        accession += 1
        count += 1
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += ( time.time() - mrTimeStart )
        except:
            # marc4j failed to parse this record; report context and retry once
            print "last record indexed was %s " % serializedRecord
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            try:
                marc4jRecord = marcReader.next() # unlikely to work but what the hey
            except:
                print "tried parsing again and failed. The lesson is, never try."
                sys.exit(1)
        mrsTime = time.time()
        # NOTE(review): skipped records still count toward MAX_RECORDS_TO_ADD
        # because the skip happens after count is incremented -- confirm intended.
        if count < RECORDS_TO_SKIP:
            continue
        rec = recordForSolr( marc4jRecord, anselUnicodeConverter, config = indexesConfig )
        marcRecordForSolrTime += ( time.time() - mrsTime )
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        mrserTime = time.time()
        serializedRecord = rec.serialize()
        marcSerializeTime += ( time.time() - mrserTime )
        # only records with a bib number are queued for posting
        if rec.bib_num is not None:
            data += serializedRecord
        if( (count % SOLR_INDEX_BATCH_SIZE ) == 0):
            # nb. neither apache commons nor python urllib works right here! Unicode gets mangled.
            # Must use postURL
            startUpdateTime = time.time()
            resp = postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data)
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            updateTime += ( time.time() - startUpdateTime )
            print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if( ( count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                resp = postURL( SOLR_UPDATE_URL, SOLR_COMMIT_MESSAGE)
                if PRINT_SOLR_RESP_DATA:
                    print resp
                commitTime += ( time.time() - beginCommitTime )
            except IOError:
                # best-effort: skip this commit and let the next one pick it up
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                # records/sec for this commit batch and for the whole run
                thisBatchRate = ( ( 0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime) )
                overallRate = ( ( 0.0 + count ) / ( time.time() - startTime) )
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % ( marcReaderTime, marcRecordForSolrTime, marcSerializeTime )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n""" % ( marcRecordToDictTime, extractorCreateTime, extractionTime )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % ( updateTime, commitTime )
            lastCommitTime = time.time()
            # ask the JVM to collect between batches
            System.gc()
    # do last batch here
    if len(data) > 0:
        print "doing final POST"
        resp = postURL( SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if PRINT_SOLR_RESP_DATA:
            print resp
    print "committing..."
    commit()
    inStream.close()
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    """Index every MARC record in ``filename`` into Solr (batched version).

    Builds record objects with solrIndexingUtils.recordForSolr, POSTs
    batches of SOLR_INDEX_BATCH_SIZE serialized documents through
    solrConnection, commits every SOLR_COMMIT_BATCH_SIZE records and
    forces an optimize every SOLR_OPTIMIZE_BATCH_SIZE records.
    Returns the number of records read.

    nonblocking -- if 0 all commits are blocking; if 1 they are nonblocking
    pid         -- worker process id for log prefixes; -1 when not parallel
    """
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    # per-phase timing accumulators, reported when PROFILE is set
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0 # TODO: try and load serialized accession
                  # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)
    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        # if pid > -1:
        #     print (".%d" % pid),
        # else:
        #     print ".", # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += time.time() - mrTimeStart
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)
        mrsTime = time.time()
        # try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        # except:
        #     print "exception processing record, skipping" # TODO: error handling
        #     continue
        marcRecordForSolrTime += time.time() - mrsTime
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime
        # only records that extracted a bib number are queued for posting
        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec
        if (count % SOLR_INDEX_BATCH_SIZE) == 0:
            # nb. neither apache commons nor python urllib works right here! Unicode gets mangled.
            # Must use postURL
            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]
            mrserTime = time.time()
            data = u"".join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += time.time() - mrserTime
            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents! Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += time.time() - startUpdateTime
            if pid > -1:
                print ("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if (count % SOLR_COMMIT_BATCH_SIZE) == 0:
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += time.time() - beginCommitTime
            except IOError:
                # best-effort: skip this commit and let the next one pick it up
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                # records/sec for this commit batch and for the whole run
                thisBatchRate = (0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime)
                overallRate = (0.0 + count) / (time.time() - startTime)
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(),
                    count,
                    thisBatchRate,
                    overallRate,
                )
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime,
                        marcRecordForSolrTime,
                        marcSerializeTime,
                    )
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime,
                        extractorCreateTime,
                        extractionTime,
                        extractMethodTime,
                    )
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (updateTime, commitTime)
            lastCommitTime = time.time()
            if (count % SOLR_OPTIMIZE_BATCH_SIZE) == 0:
                print "[%s] FORCING OPTIMIZE..." % time.ctime()
                solrConnection.optimize()
                print "[%s] OPTIMIZE done" % time.ctime()
            # ask the JVM to collect between batches
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = "".join([x.serialize() for x in recordBatch])
        recordBatch = []
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents! Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count
"""
# Jython: Java standard library for JDBC database access.
from java.lang import *
from java.io import *
from java.sql import *
from java.util import *
import sys, urllib, time
from loadPropsFile import *
from config.solr import *
import solrConnection
#1. get connection to the database
props = loadPropsFile("config/ILSConn.properties")
# TODO: decide if this should use splcommons.HorizonConn instead
dbType = props['db.type']
if dbType != "sybase":
    # only Sybase (via jTDS) is wired up; bail out for anything else
    print "database type %s not yet supported." % dbType
    sys.exit(1)
else:
    # build the jTDS JDBC URL from the connection properties and connect
    connString = "jdbc:jtds:%(db.type)s://%(db.server)s:%(db.port)s/%(db.name)s" % props
    Class.forName( props['db.driver'] ).newInstance()
    conn = DriverManager.getConnection( connString, props['db.user'], props['db.password'] )
# NOTE(review): definition continues beyond this chunk; body not visible here.
def updateSolrRecordAvailability( bibNum, availableLocations = [], doPost =1 ):
    """this function updates an already-indexed record from Solr with new location information.
"""
# Jython: Java standard library for JDBC database access.
from java.lang import *
from java.io import *
from java.sql import *
from java.util import *
import sys, urllib, time
from loadPropsFile import *
from config.solr import *
import solrConnection
#1. get connection to the database
props = loadPropsFile("config/ILSConn.properties")
# TODO: decide if this should use splcommons.HorizonConn instead
dbType = props['db.type']
if dbType != "sybase":
    # only Sybase (via jTDS) is wired up; bail out for anything else
    print "database type %s not yet supported." % dbType
    sys.exit(1)
else:
    # build the jTDS JDBC URL from the connection properties and connect
    connString = "jdbc:jtds:%(db.type)s://%(db.server)s:%(db.port)s/%(db.name)s" % props
    Class.forName(props['db.driver']).newInstance()
    conn = DriverManager.getConnection(connString, props['db.user'], props['db.password'])
# NOTE(review): body of this function lies beyond this chunk.  The mutable
# default `availableLocations=[]` is shared across calls -- confirm the body
# never mutates it.
def updateSolrRecordAvailability(bibNum, availableLocations=[], doPost=1):
def __init__(self, marcRecord, anselUnicodeConverter=None, accession=None, profile=0, propsObject=None, indexerProperties="config/indexer.properties"): start = time.time() self._marcRecordToDictTime = 0 self._extractionTime = 0 self._extractorCreateTime = 0 self._extractMethodTime = 0 if profile: self._marcRecordToDictProfiling = {} if anselUnicodeConverter is None: print "creating ansel -> unicode converter" # csdebug anselUnicodeConverter = AnselToUnicode() if profile: record, _perfData = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter) self._marcRecordToDictProfiling = _perfData else: record = marcIndexingUtils.marcRecordToDict( marcRecord, anselUnicodeConverter) self._marcRecordToDictTime = time.time() - start start = time.time() if not propsObject: config = loadPropsFile(indexerProperties) else: config = propsObject extractor = MarcExtractor(record) self._extractorCreateTime = (time.time() - start) # TODO: decide if this should be turn-offable or not self.marc_record = str(record) fieldsToDo = [x.strip() for x in config['active.fields'].split(",")] _processors = __import__('processors', {}, {}, ['']) for fieldOn in fieldsToDo: start = time.time() processorNameOn = config.get("%s.processor" % fieldOn, "standardProcessor") marcMapOn = config.get("%s.marcMap" % fieldOn, None) # do processing if processorNameOn == "standardProcessor": # then just use the MARC extractor separatorOn = config.get("%s.marcMap.separator" % fieldOn, " ") stripTrailingCommas = int( config.get("%s.stripTrailingCommas" % fieldOn, "0")) if stripTrailingCommas: extractMethodStart = time.time() processedResult = extractor.extract( marcMapOn, separator=separatorOn, trailingPunctuationToStrip=[","], stripTrailingPunctuation=1) self._extractMethodTime += (time.time() - extractMethodStart) else: stripTrailingPunctuation = int( config.get("%s.stripTrailingPunctuation" % fieldOn, "0")) try: extractMethodStart = time.time() processedResult = extractor.extract( marcMapOn, 
separator=separatorOn, stripTrailingPunctuation=stripTrailingPunctuation) self._extractMethodTime += (time.time() - extractMethodStart) except AttributeError: print "You do not have a correct marc mapping set up for field %s" % fieldOn if ((processedResult == None) or len(processedResult) == 0) and config.has_key( "%s.marcMap.lastResort" % fieldOn): marcMapOn = config.get("%s.marcMap.lastResort" % fieldOn, None) extractMethodStart = time.time() processedResult = extractor.extract(marcMapOn, separator=separatorOn) self._extractMethodTime += (time.time() - extractMethodStart) else: # get and run custom processor processorOn = getattr(_processors, processorNameOn) processedResult = processorOn(record, marcMap=marcMapOn, extractor=extractor) # do post-processing based upon type typeOn = config.get("%s.type" % fieldOn, "multi") if typeOn == "single" and (type(processedResult) == type( [])) and len(processedResult) > 0: postProcessedResult = processedResult[0] elif typeOn == "singleTranslation": if (type(processedResult) == type([])): if len(processedResult) >= 1: processedResult = processedResult[0] else: processedResult = None translationMapName = config.get("%s.translationMap" % fieldOn, None) if translationMapName is not None: _translationMapModule = __import__("config.codes", {}, {}, ['']) _translationMap = getattr(_translationMapModule, translationMapName) postProcessedResult = _translationMap.get( processedResult, None) else: postProcessedResult = processedResult # deal with stripWhitespace after all other text manipulations stripWhitespace = int( config.get("%s.stripWhitespace" % fieldOn, "0")) if stripWhitespace: if type(postProcessedResult) == type(""): postProcessedResult = postProcessedResult.strip() elif type(postProcessedResult) == type([]): postProcessedResult = [ x.strip() for x in postProcessedResult ] # FINALLY, set own attribute if postProcessedResult is not None and len( postProcessedResult) > 0: setattr(self, fieldOn, postProcessedResult) 
self._extractionTime += (time.time() - start)
def processFile(filename, anselUnicodeConverter=None, nonblocking=0, pid=-1):
    """Index every MARC record in ``filename`` into Solr (batched version).

    Builds record objects with solrIndexingUtils.recordForSolr, POSTs
    batches of SOLR_INDEX_BATCH_SIZE serialized documents through
    solrConnection, commits every SOLR_COMMIT_BATCH_SIZE records and
    forces an optimize every SOLR_OPTIMIZE_BATCH_SIZE records.
    Returns the number of records read.

    nonblocking -- if 0 all commits are blocking; if 1 they are nonblocking
    pid         -- worker process id for log prefixes; -1 when not parallel
    """
    # if nonblocking == 0 then all commits are blocking; if 1 then they are nonblocking.
    inStream = FileInputStream(filename)
    print "processFile>> %s" % filename
    marcReader = MarcStreamReader(inStream)
    data = ""
    count = 0
    lastCommitTime = None
    import time
    startTime = time.time()
    lastRecord = None
    lastBibNum = None
    m4j = None
    # per-phase timing accumulators, reported when PROFILE is set
    marcReaderTime = 0
    marcRecordToDictTime = 0
    extractorCreateTime = 0
    extractionTime = 0
    extractMethodTime = 0
    marcRecordForSolrTime = 0
    commitTime = 0
    updateTime = 0
    marcSerializeTime = 0
    accession = 0 # TODO: try and load serialized accession
                  # from somewhere
    serializedRecord = None
    recordBatch = []
    # get default properties file
    from loadPropsFile import *
    props = loadPropsFile(DEFAULT_INDEXING_PROPERTIES_FILE)
    while marcReader.hasNext() and count < NUM_RECORDS_TO_ADD:
        #if pid > -1:
        #    print (".%d" % pid),
        #else:
        #    print ".", # CSDEBUG
        accession += 1
        count += 1
        # TODO: improve error handling here (main problem is that Marc4J will fall over
        # at the sight of a bad record and there's no way to get it to just skip over
        # a bad record -- so there is little we can do, except better error messages!
        try:
            mrTimeStart = time.time()
            marc4jRecord = marcReader.next()
            marcReaderTime += (time.time() - mrTimeStart)
        except:
            print "last record indexed was bib# %s " % lastBibNum
            import sys
            print "sys.exc_info is %s" % str(sys.exc_info())
            sys.exit(1)
        mrsTime = time.time()
        #try:
        rec = solrIndexingUtils.recordForSolr(marc4jRecord, anselUnicodeConverter, propsObject=props)
        #except:
        #    print "exception processing record, skipping" # TODO: error handling
        #    continue
        marcRecordForSolrTime += (time.time() - mrsTime)
        extractionTime += rec._extractionTime
        extractorCreateTime += rec._extractorCreateTime
        marcRecordToDictTime += rec._marcRecordToDictTime
        extractMethodTime += rec._extractMethodTime
        # only records that extracted a bib number are queued for posting
        if hasattr(rec, "bib_num"):
            recordBatch.append(rec)
            lastBibNum = rec.bib_num
        else:
            print "not adding record %s; no bib_num present!" % rec
        if ((count % SOLR_INDEX_BATCH_SIZE) == 0):
            # nb. neither apache commons nor python urllib works right here! Unicode gets mangled.
            #Must use postURL
            # fetch the item status info if required.
            if DO_ITEM_STATUS_INDEXING:
                bibs = [x.bib_num for x in recordBatch]
                avail = horizonItemStatus.availableAt(bibs)
                for x in recordBatch:
                    x.available = avail[x.bib_num]
            mrserTime = time.time()
            data = u''.join([x.serialize() for x in recordBatch])
            recordBatch = []
            marcSerializeTime += (time.time() - mrserTime)
            startUpdateTime = time.time()
            try:
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
            except IOError:
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
                resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
                # if it fails again here, we want to just bomb out.
            if resp.find('<result status="1"') > -1:
                print "\nError POSTing documents! Response from Solr was\n\n%s\n" % resp
            # TODO: put in retry/continue code here for failed updates/slowdowns on Solr
            # TODO: parse result status and do something if there is an error (like print stacktrace)
            updateTime += (time.time() - startUpdateTime)
            if pid > -1:
                print("*%d" % pid),
            else:
                print "*",
            if PRINT_SOLR_POST_DATA:
                print "\n\n<add>%s</add>\n\n" % data
            data = ""
        if ((count % SOLR_COMMIT_BATCH_SIZE) == 0):
            try:
                print "committing..."
                beginCommitTime = time.time()
                if nonblocking:
                    print "doing nonblocking commit"
                    solrConnection.commitNonblocking()
                else:
                    solrConnection.commit()
                commitTime += (time.time() - beginCommitTime)
            except IOError:
                # best-effort: skip this commit and let the next one pick it up
                import time
                print "Connection reset when talking to Solr, skipping this commit and sleeping 10 sec."
                time.sleep(10)
            if lastCommitTime:
                # records/sec for this commit batch and for the whole run
                thisBatchRate = ((0.0 + SOLR_COMMIT_BATCH_SIZE) / (time.time() - lastCommitTime))
                overallRate = ((0.0 + count) / (time.time() - startTime))
                if pid > -1:
                    print "\n>>>>>>>>>>>>COMMIT for PID %s<<<<<<<<<<<<<<<\n" % pid # csdebug
                print "[%s] %s records indexed\t| This Batch: %.4f records/sec|\tOverall: %.4f records/sec" % (
                    time.ctime(), count, thisBatchRate, overallRate)
                if PROFILE:
                    print """\nfile->MARC: %.4f\nMARC->py: %.4f\npy->XML: %.4f\n""" % (
                        marcReaderTime, marcRecordForSolrTime, marcSerializeTime)
                    print """MARC to dict: %.4f\ncreate extractor: %.4f\nextraction: %.4f\n\textract method: %.4f""" % (
                        marcRecordToDictTime, extractorCreateTime, extractionTime, extractMethodTime)
                    print """Solr Update: %.4f\nSolr Commit: %.4f\n""" % (
                        updateTime, commitTime)
            lastCommitTime = time.time()
            if ((count % SOLR_OPTIMIZE_BATCH_SIZE) == 0):
                print "[%s] FORCING OPTIMIZE..." % time.ctime()
                solrConnection.optimize()
                print "[%s] OPTIMIZE done" % time.ctime()
            # ask the JVM to collect between batches
            System.gc()
    # do last batch here
    if len(recordBatch) > 0:
        print "doing final POST"
        mrserTime = time.time()
        data = ''.join([x.serialize() for x in recordBatch])
        recordBatch = []
        resp = solrConnection.postURL(SOLR_UPDATE_URL, "<add>%s</add>" % data)
        if resp.find('<result status="1"') > -1:
            print "\nError POSTing documents! Response from Solr was\n\n%s\n\n" % resp
    print "committing..."
    if nonblocking:
        solrConnection.commitNonblocking()
    else:
        solrConnection.commit()
    inStream.close()
    return count