def parseRecord(encodedRec, outManager):
    """Parse an individual record. Verifies that a JSON object can be decoded
    from the body of the SQS message and, if so, hands it to the cover parser.
    """
    try:
        record = json.loads(encodedRec['body'])
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')

    logger.info('Storing cover from {}'.format(record['url']))
    coverParser = CoverParse(record)
    coverParser.storeCover()

    outManager.putKinesis(
        {
            'originalURL': coverParser.remoteURL.lower(),
            'storedURL': coverParser.s3CoverURL
        },
        os.environ['DB_UPDATE_STREAM'],
        recType='cover'
    )

    return coverParser.s3CoverURL
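# Hedged sketch (illustrative, not from the source): the handler above only
# requires that the SQS message body decode to JSON containing a 'url' key
# pointing at the cover image; any additional keys CoverParse may read are
# not shown in this snippet, so none are assumed here.
exampleCoverMessage = {
    'body': json.dumps({'url': 'https://example.com/covers/12345.jpg'})
}
# s3URL = parseRecord(exampleCoverMessage, outManager)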
def parseRecord(encodedRec):
    """Parse an individual record. Verifies that a JSON object can be decoded
    from the body of the SQS message and, if so, hands it to the clustering
    manager to generate editions.
    """
    try:
        record = json.loads(encodedRec['body'])

        logger.info('Creating editions for work {}'.format(
            record['identifier']
        ))

        try:
            clustManager = ClusterManager(record, MANAGER)
            clustManager.clusterInstances()
            clustManager.deleteExistingEditions()
            clustManager.storeEditions()
        except Exception as err:  # noqa: Q000
            # There are a large number of SQLAlchemy errors that can be thrown
            # These should be handled elsewhere, but this should catch anything
            # and rollback the session if we encounter something unexpected
            MANAGER.session.rollback()  # Rollback current record only
            logger.error('Failed to store record {}'.format(
                record['identifier']
            ))
            logger.debug(err)
            logger.debug(traceback.format_exc())
            return ('failure', '{}|{}'.format(
                clustManager.work.uuid, clustManager.work.title
            ))

        session = MANAGER.createSession()
        session.add(clustManager.work)

        esManager = ElasticManager(clustManager.work)
        esManager.enhanceWork()
        esManager.saveWork()

        session.close()

        return ('success', '{}|{}'.format(
            clustManager.work.uuid, clustManager.work.title
        ))
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')
def addClassItem(self, listAttrib, classType, **identifierDict):
    if listAttrib not in dir(self):
        raise DataError('Field {} not valid for {}'.format(
            listAttrib, self.__class__.__name__
        ))

    self[listAttrib].append(classType.createFromDict(**identifierDict))
def test_record_parse_write_err(self, mockManager, mockSession):
    testRec = base64.b64encode(json.dumps({
        'status': 200,
        'data': 'data'
    }).encode('utf-8'))
    mockManager.importRecord.side_effect = DataError('test err')
    res = self.parseRecord({'kinesis': {'data': testRec}}, mockManager)
    self.assertNotEqual(res, True)
def parseRecord(encodedRec):
    """Parse an individual record. Verifies that a JSON object can be decoded
    from the body of the SQS message and, if so, hands it to the enhancer
    method.
    """
    try:
        record = json.loads(encodedRec['body'])
        return enhanceRecord(record)
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Malformed JSON block received from SQS')
    except KeyError as err:
        logger.error('Missing body attribute in SQS message')
        logger.debug(err)
        raise DataError('Body object missing from SQS message')
    except (DataError, OCLCError) as err:
        logger.error(err.message)
        return False
def readFromClassify(workXML, workUUID):
    """Parse a Classify XML document into an object that complies with the
    SFR data model. Accepts a single XML document and returns a WorkRecord.
    """
    logger.debug('Parsing Returned Work')

    work = workXML.find('.//work', namespaces=NAMESPACE)
    start = workXML.find('.//start', namespaces=NAMESPACE)

    oclcTitle = work.get('title')
    oclcNo = Identifier('oclc', work.text, 1)
    owiNo = Identifier('owi', work.get('owi'), 1)

    if OutputManager.checkRecentQueries('lookup/{}/{}/{}'.format(
        'owi', work.get('owi'), start.text
    )) is True:
        raise DataError('Work {} with OWI {} already classified'.format(
            workUUID, work.get('owi')
        ))

    measurements = []
    for measure in ['editions', 'holdings', 'eholdings']:
        measurements.append(Measurement(
            measure,
            work.get(measure),
            1,
            MEASUREMENT_TIME,
            work.text
        ))

    authors = workXML.findall('.//author', namespaces=NAMESPACE)
    authorList = list(map(parseAuthor, authors))

    editions = workXML.findall('.//edition', namespaces=NAMESPACE)
    editionList = loadEditions(editions)

    headings = workXML.findall('.//heading', namespaces=NAMESPACE)
    headingList = list(map(parseHeading, headings))

    workDict = {
        'title': oclcTitle,
        'agents': authorList,
        'instances': editionList,
        'subjects': headingList,
        'identifiers': [
            oclcNo,
            owiNo
        ],
        'measurements': measurements
    }

    instanceCount = int(work.get('editions', 0))

    return WorkRecord.createFromDict(**workDict), instanceCount, work.text
def createFromDict(cls, **kwargs):
    """Take a standard dict object and convert to an instance of the provided
    class. Allows for creation of new instances with arbitrary fields set.
    """
    record = cls()
    for field, value in kwargs.items():
        if field not in dir(record):
            raise DataError('Field {} not valid for {}'.format(
                field, cls.__name__
            ))
        record[field] = value
    return record
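# Hedged usage sketch (illustrative, not from the source): createFromDict is
# invoked on the class itself (see readFromClassify and addClassItem above).
# The field names below mirror the workDict built in readFromClassify; any
# subclass-specific validation beyond the dir() check is not assumed here.
def _exampleCreateWorkFromDict():
    return WorkRecord.createFromDict(
        title='Example Title',
        identifiers=[Identifier('oclc', '12345', 1)],
        measurements=[]
    )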
def clusterInstances(self):
    session = self.dbManager.createSession()

    self.work = self.fetchWork(session)
    self.logger.info('Creating editions for {}'.format(self.work))

    if len(self.work.instances) < 1:
        raise DataError('Work Record has no attached instance Records')

    mlModel = KModel(self.work.instances)
    mlModel.createDF()

    session.close()

    mlModel.generateClusters()
    self.editions = mlModel.parseEditions()
def buildDataModel(self, countryCodes):
    logger.debug('Generating work record for bib record {}'.format(
        self.ingest['bib_key']
    ))

    # If we don't have a valid rights code, this means that the row has
    # been improperly formatted (generally fields out of order/misplaced)
    # Raise an error for this row if that is found to be true
    if self.ingest['rights_statement'] not in HathiRecord.rightsReasons:
        raise DataError(
            '{} is malformed (columns missing or incorrect)'.format(
                self.ingest['htid']
            )
        )

    self.buildWork()

    logger.debug('Generating instance record for hathi record {}'.format(
        self.ingest['htid']
    ))
    self.buildInstance(countryCodes)

    logger.debug('Generating an item record for hathi record {}'.format(
        self.ingest['htid']
    ))
    self.buildItem()

    logger.debug('Generating a rights object for the associated rights statement {}'.format(
        self.ingest['rights']
    ))
    # Generate a stand-alone rights object that contains the hathi
    # generated rights information
    self.createRights()

    for agent in self.work.agents:
        self.getVIAF(agent)

    for instance in self.work.instances:
        for agent in instance.agents:
            self.getVIAF(agent)

        for item in instance.formats:
            for agent in item.agents:
                self.getVIAF(agent)
def generateIdentifierURL(self):
    """Creates a query based on an identifier and its type. If either field
    is missing for this request, default to an author/title search.
    """
    if self.recID is not None and self.recType is not None:
        if self.recType not in QueryManager.LOOKUP_IDENTIFIERS:
            raise DataError(
                'Unrecognized/invalid identifier type {} received'.format(
                    self.recType
                )
            )
        self.query = "{}?{}={}".format(
            QueryManager.CLASSIFY_ROOT,
            self.recType,
            self.recID
        )
        self.addClassifyOptions()
    else:
        self.generateAuthorTitleURL()
def generateAuthorTitleURL(self):
    """Generates an author/title query for Classify.

    Raises:
        DataError: Raised if the author or title is missing, since an
        author-less search can return unexpectedly large result sets.
    """
    if self.author is None or self.title is None:
        raise DataError('Author and title required for search')

    self.cleanTitle()

    titleAuthorParam = 'title={}&author={}'.format(self.title, self.author)

    self.query = "{}?{}".format(
        QueryManager.CLASSIFY_ROOT,
        titleAuthorParam
    )

    self.addClassifyOptions()
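# Hedged sketch (illustrative, not from the source): given the two builders
# above, self.query ends up shaped like the strings below before
# addClassifyOptions appends its parameters. QueryManager.CLASSIFY_ROOT is not
# shown in these snippets, so a placeholder root is used here.
exampleIdentifierQuery = 'http://classify.example/classify2/Classify?oclc=12345'
exampleAuthorTitleQuery = (
    'http://classify.example/classify2/Classify'
    '?title=Example Title&author=Doe, Jane'
)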
def parseHoldingURI(uri):
    logger.info('Loading URI {}'.format(uri))

    try:
        uriHead = requests.head(uri, allow_redirects=False)
        headers = uriHead.headers
    except (MissingSchema, ConnectionError, InvalidURL):
        raise DataError('Invalid Holding URL')

    if uriHead.status_code in [301, 302, 307, 308]:
        redirectTo = headers['Location']
        logger.debug('Found {} Redirect to {}'.format(
            uriHead.status_code, redirectTo
        ))
        return parseHoldingURI(redirectTo)

    try:
        contentType = headers['Content-Type']
    except KeyError:
        logger.warning('Unable to find header Content-Type for {}'.format(uri))
        contentType = 'text/html'

    return uri, contentType
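# Hedged usage sketch (illustrative, not from the source): parseHoldingURI
# follows 301/302/307/308 redirects recursively and returns the final URI plus
# its Content-Type, defaulting to 'text/html' when the header is missing. The
# URL below is a placeholder; the call performs a real HEAD request, so it is
# left commented out.
# holdingURI, holdingType = parseHoldingURI('https://example.com/holdings/123')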
def loadMARCRelators(self):
    """DOAB identifies contributors to its records using MARC Relator codes.
    These are not available from any existing library, so they must be
    translated into human-readable form. This parses the LoC's provided
    JSON-LD file into a dictionary of translated codes.
    """
    relRes = requests.get(self.relators_file)
    if relRes.status_code != 200:
        logger.error('Failed to load MARC21 Relator Authority')
        logger.debug(relRes.text)
        raise DataError('Unable to load necessary MARC21 Authority')

    relJSON = json.loads(relRes.content)

    terms = {}
    rdfLabel = 'http://www.loc.gov/mads/rdf/v1#authoritativeLabel'
    for rel in relJSON:
        try:
            code = rel['@id'].split('/')[-1]
            terms[code] = rel[rdfLabel][0]['@value']
        except KeyError:
            continue

    return terms
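# Hedged sketch (illustrative, not from the source): loadMARCRelators returns
# a plain dict keyed by MARC relator code. 'aut' and 'trl' are standard MARC
# codes; the exact label strings depend on the LoC authority file.
exampleRelatorTerms = {'aut': 'Author', 'trl': 'Translator'}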
def test_row_parse_data_error(self, mock_hathi):
    mock_hathi().buildDataModel.side_effect = DataError('Test Error')
    with self.assertRaises(ProcessingError):
        rowParser(['row1'], ['htid'], {})
def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service, managing the overall enhancement workflow.
    """
    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))

    try:
        # Step 1: Generate a set of XML records retrieved from Classify
        # This step also adds the oclc identifiers to the sourceData record
        classifyData = classifyRecord(
            searchType,
            searchFields,
            workUUID,
            start=startPos
        )

        # Step 2: Parse the data received from Classify into the SFR data model
        classifiedWork, instanceCount, oclcNo = readFromClassify(
            classifyData, workUUID
        )

        logger.debug('Instances found {}'.format(instanceCount))
        if instanceCount > 500:
            iterStop = startPos + instanceCount
            if instanceCount > 1500:
                iterStop = startPos + 1500
            for i in range(startPos + 500, iterStop, 500):
                classifyPage = classifyRecord(
                    searchType,
                    searchFields,
                    workUUID,
                    start=i
                )
                extractAndAppendEditions(classifiedWork, classifyPage)

        if instanceCount > startPos + 1500:
            OutputManager.putQueue(
                {
                    'type': 'identifier',
                    'uuid': workUUID,
                    'fields': {
                        'idType': 'oclc',
                        'identifier': oclcNo,
                        'start': startPos + 1500
                    }
                },
                os.environ['CLASSIFY_QUEUE']
            )

        # This sets the primary identifier for processing by the db manager
        classifiedWork.primary_identifier = Identifier('uuid', workUUID, 1)

        # Step 3: Output this block to Kinesis
        outputObject = {
            'status': 200,
            'type': 'work',
            'method': 'update',
            'data': classifiedWork
        }

        while len(classifiedWork.instances) > 100:
            instanceChunk = classifiedWork.instances[0:100]
            del classifiedWork.instances[0:100]
            OutputManager.putKinesis(
                {
                    'status': 200,
                    'type': 'work',
                    'method': 'update',
                    'data': {
                        'instances': instanceChunk,
                        'primary_identifier': Identifier('uuid', workUUID, 1)
                    }
                },
                os.environ['OUTPUT_KINESIS'],
                workUUID
            )

        OutputManager.putKinesis(
            outputObject,
            os.environ['OUTPUT_KINESIS'],
            workUUID
        )
    except OCLCError as err:
        logger.error('OCLC Query for work {} failed with message: {}'.format(
            workUUID, err.message
        ))
        raise err

    return True
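# Hedged sketch (illustrative, not from the source): enhanceRecord expects a
# dict with 'uuid', 'type', and 'fields', plus an optional top-level 'start'
# offset used to page through Classify results in blocks of 500. The values
# below are placeholders modeled on the retry message queued inside the
# function above.
exampleEnhanceRecord = {
    'uuid': '00000000-0000-0000-0000-000000000000',
    'type': 'identifier',
    'fields': {
        'idType': 'oclc',
        'identifier': '12345'
    },
    'start': 0
}
# enhanceRecord(exampleEnhanceRecord)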
def parseMessage(self, record):
    self.idType = record.get('type', 'uuid')
    self.identifier = record.get('identifier', None)
    if self.identifier is None:
        self.logger.error('Missing identifier from SQS message')
        raise DataError('Missing identifier for invocation')
def parseRecord(encodedRec, updater):
    """Handles each individual record by parsing JSON from the base64 encoded
    string received from the Kinesis stream, creating a database session and
    inserting/updating the database to reflect this new data source. It will
    roll back changes if an error is encountered.
    """
    try:
        record = json.loads(base64.b64decode(encodedRec['kinesis']['data']))
        statusCode = record['status']
        if statusCode != 200:
            if statusCode == 204:
                logger.info('No updates received')
                raise NoRecordsReceived(
                    'No records received from {}'.format(record['source']),
                    record
                )
            else:
                logger.error('Received error from pipeline')
                logger.debug(record)
                raise DataError('Received non-200 status code')
    except json.decoder.JSONDecodeError as jsonErr:
        logger.error('Invalid JSON block received')
        logger.error(jsonErr)
        raise DataError('Invalid JSON block')
    except (UnicodeDecodeError, binascii.Error) as b64Err:
        logger.error('Invalid data found in base64 encoded block')
        logger.debug(b64Err)
        raise DataError('Error in base64 encoding of record')

    outRec = None
    try:
        MANAGER.startSession()  # Start transaction
        outRec = updater.importRecord(deepcopy(record))
        MANAGER.commitChanges()
    except OperationalError as opErr:
        logger.error('Conflicting updates caused deadlock, retry')
        logger.debug(opErr)
        OutputManager.putKinesis(
            record.get('data'),
            os.environ['UPDATE_STREAM'],
            recType=record.get('type', 'work'),
        )
        MANAGER.session.rollback()  # Rollback current record only
    except IntegrityError as intErr:
        logger.error('Unique constraint violated, retry')
        logger.debug(intErr)
        OutputManager.putKinesis(
            record.get('data'),
            os.environ['UPDATE_STREAM'],
            recType=record.get('type', 'work'),
        )
        MANAGER.session.rollback()  # Rollback current record only
    except Exception as err:  # noqa: Q000
        # There are a large number of SQLAlchemy errors that can be thrown
        # These should be handled elsewhere, but this should catch anything
        # and rollback the session if we encounter something unexpected
        logger.error('Failed to store record')
        logger.debug(err)
        logger.debug(traceback.format_exc())
        MANAGER.session.rollback()  # Rollback current record only

    return outRec
def test_DataError(self):
    testDataError = DataError('testMessage')
    assert testDataError.message == 'testMessage'
class TestHandler(unittest.TestCase):
    @patch.multiple(
        SessionManager,
        generateEngine=DEFAULT,
        decryptEnvVar=DEFAULT
    )
    def setUp(self, generateEngine, decryptEnvVar):
        from service import handler, parseRecords, parseRecord
        self.handler = handler
        self.parseRecords = parseRecords
        self.parseRecord = parseRecord

    @patch('service.parseRecords', return_value=True)
    def test_handler_clean(self, mock_parse):
        testRec = {
            'source': 'Kinesis',
            'Records': [{
                'kinesis': {
                    'data': 'data'
                }
            }]
        }
        resp = self.handler(testRec, None)
        self.assertTrue(resp)

    def test_handler_error(self):
        testRec = {'source': 'Kinesis', 'Records': []}
        with self.assertRaises(NoRecordsReceived):
            self.handler(testRec, None)

    def test_records_none(self):
        testRec = {'source': 'Kinesis'}
        with self.assertRaises(NoRecordsReceived):
            self.handler(testRec, None)

    @patch('service.parseRecord', side_effect=[1, 2, 3])
    @patch('service.MANAGER')
    @patch('service.DBUpdater')
    def test_parseRecords_success(self, mockUpdater, mockManager, mockParse):
        recResults = self.parseRecords(['rec1', 'rec2', 'rec3'])
        self.assertEqual(recResults, [1, 2, 3])
        mockManager.closeConnection.assert_called_once()
        self.assertEqual(mockParse.call_count, 3)

    @patch('service.parseRecord', side_effect=[1, DataError('testing'), 3])
    @patch('service.MANAGER')
    def test_parseRecords_error(self, mockManager, mockParse):
        recResults = self.parseRecords(['rec1', 'rec2', 'rec3'])
        self.assertEqual(recResults[0], 1)
        self.assertEqual(len(recResults), 1)
        mockManager.closeConnection.assert_called_once()

    @patch('service.MANAGER')
    def test_parseRecord_success(self, mockManager):
        encStr = b64encode(json.dumps({
            'status': 200,
            'source': 'testing'
        }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        mockUpdater = MagicMock()
        mockUpdater.importRecord.return_value = 'import_record'
        importRec = self.parseRecord(testRecord, mockUpdater)
        mockManager.startSession.assert_called_once()
        mockManager.commitChanges.assert_called_once()
        self.assertEqual(importRec, 'import_record')

    @patch('service.MANAGER')
    def test_parseRecord_dbErr(self, mockManager):
        encStr = b64encode(json.dumps({
            'status': 200,
            'source': 'testing'
        }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        mockUpdater = MagicMock()
        mockUpdater.importRecord.side_effect = OperationalError
        importRec = self.parseRecord(testRecord, mockUpdater)
        mockManager.startSession.assert_called_once()
        mockManager.commitChanges.assert_not_called()
        mockManager.session.rollback.assert_called_once()
        self.assertEqual(importRec, None)

    def test_parseRecord_noRecordsErr(self):
        encStr = b64encode(json.dumps({
            'status': 204,
            'source': 'testing'
        }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(NoRecordsReceived):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_otherRecordErr(self):
        encStr = b64encode(json.dumps({
            'status': 500,
            'source': 'testing'
        }).encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_jsonErr(self):
        encStr = b64encode('{"bad: "json"}'.encode('utf-8'))
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')

    def test_parseRecord_b64Err(self):
        encStr = json.dumps({'bad': 'base64'}).encode('utf-8')
        testRecord = {'kinesis': {'data': encStr}}
        with self.assertRaises(DataError):
            self.parseRecord(testRecord, 'updater')