def matchEbook(self):
    for source, regex in self.EBOOK_REGEX.items():
        self.source = source
        if re.search(regex, self.uri):
            # Check if link is accessible (e.g. public domain/open source)
            if source == 'internetarchive':
                if self.checkIAStatus() is True:
                    return None
                linkID = Identifier(
                    identifier='ia.{}'.format(self.identifier),
                    source=None
                )
            elif source == 'hathitrust':
                self.parseHathiLink()
                return None
            else:
                linkID = Identifier(
                    identifier=self.identifier,
                    source='gutenberg'
                )
            self.instance.addFormat(**{
                'source': source,
                'content_type': 'ebook',
                'links': [
                    self.createLink(
                        self.uri,
                        'text/html',
                        local=False,
                        download=False,
                        images=False,
                        ebook=True
                    )
                ],
                'identifiers': [linkID]
            })
            return True
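# A minimal sketch of the EBOOK_REGEX mapping that matchEbook iterates over.
# The patterns below are illustrative assumptions; the method only requires
# a dict of source name -> URL regex covering the three sources handled above.
EBOOK_REGEX_SKETCH = {
    'internetarchive': r'archive\.org/details/[\w.\-]+',
    'hathitrust': r'catalog\.hathitrust\.org',
    'gutenberg': r'gutenberg\.org/ebooks/\d+',
}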
def readFromClassify(workXML, workUUID):
    """Parse a Classify XML document into an object that complies with the
    SFR data model. Accepts a single XML document and returns a WorkRecord.
    """
    logger.debug('Parsing Returned Work')

    work = workXML.find('.//work', namespaces=NAMESPACE)
    start = workXML.find('.//start', namespaces=NAMESPACE)

    oclcTitle = work.get('title')
    oclcNo = Identifier('oclc', work.text, 1)
    owiNo = Identifier('owi', work.get('owi'), 1)

    if OutputManager.checkRecentQueries('lookup/{}/{}/{}'.format(
        'owi', work.get('owi'), start.text
    )) is True:
        raise DataError('Work {} with OWI {} already classified'.format(
            workUUID, work.get('owi')
        ))

    measurements = []
    for measure in ['editions', 'holdings', 'eholdings']:
        measurements.append(Measurement(
            measure,
            work.get(measure),
            1,
            MEASUREMENT_TIME,
            work.text
        ))

    authors = workXML.findall('.//author', namespaces=NAMESPACE)
    authorList = list(map(parseAuthor, authors))

    editions = workXML.findall('.//edition', namespaces=NAMESPACE)
    editionList = loadEditions(editions)

    headings = workXML.findall('.//heading', namespaces=NAMESPACE)
    headingList = list(map(parseHeading, headings))

    workDict = {
        'title': oclcTitle,
        'agents': authorList,
        'instances': editionList,
        'subjects': headingList,
        'identifiers': [
            oclcNo,
            owiNo
        ],
        'measurements': measurements
    }

    instanceCount = int(work.get('editions', 0))

    return WorkRecord.createFromDict(**workDict), instanceCount, work.text
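# Rough shape of the Classify response the parser above expects, inferred
# from the element and attribute reads in readFromClassify (the namespace
# URI is an assumption based on OCLC's published Classify service):
#
# <classify xmlns="http://classify.oclc.org">
#   <start>0</start>
#   <work title="..." owi="..." editions="..." holdings="..."
#         eholdings="...">OCLC_NUMBER</work>
#   <authors><author>...</author></authors>
#   <editions><edition .../></editions>
#   ...<heading .../>...
# </classify>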
def matchEbook(self):
    for source, regex in self.EBOOK_REGEX.items():
        self.source = source
        if re.search(regex, self.uri):
            if source == 'internetarchive':
                if self.checkIAStatus() is True:
                    return None
            elif source == 'hathitrust':
                self.parseHathiLink()
                return None
            self.instance.addFormat(**{
                'source': source,
                'content_type': 'ebook',
                'links': [
                    self.createLink(
                        self.uri,
                        'text/html',
                        local=False,
                        download=False,
                        images=False,
                        ebook=True
                    )
                ],
                'identifiers': [
                    Identifier(identifier=self.identifier, source='hathi')
                ]
            })
            return True
def getNewItemLinks(self, recItem):
    # Skip items whose rights codes indicate they are not freely accessible
    if recItem.get('rightsCode', 'ic') in ['ic', 'icus', 'ic-world', 'und']:
        return

    # Resolve the item URL's redirect to get the canonical HathiTrust URL
    redirectURL = requests.head(recItem['itemURL'])
    realURL = redirectURL.headers['Location'].replace('https://', '')
    hathiID = re.search(self.HATHI_ID_REGEX, realURL).group(1)
    downloadURL = self.HATHI_DOWNLOAD_URL.format(hathiID)

    return {
        'source': self.source,
        'content_type': 'ebook',
        'links': [
            HoldingParser.createLink(
                realURL,
                'text/html',
                local=False, download=False, images=True, ebook=False
            ),
            HoldingParser.createLink(
                downloadURL,
                'application/pdf',
                local=False, download=True, images=True, ebook=False
            )
        ],
        'identifiers': [Identifier(identifier=hathiID, source='hathi')]
    }
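# Illustrative values for the class constants used above (assumptions, not
# taken from the source; only the contracts matter: HATHI_ID_REGEX must
# expose the item ID in capture group 1, and HATHI_DOWNLOAD_URL must accept
# that ID in a format slot):
HATHI_ID_REGEX_SKETCH = r'babel\.hathitrust\.org/cgi/pt\?id=([^&]+)'
HATHI_DOWNLOAD_URL_SKETCH = (
    'https://babel.hathitrust.org/cgi/imgsrv/download/pdf?id={}'
)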
def buildWork(self):
    """Construct the SFR Work object from the Hathi data"""
    self.work.title = self.ingest['title']
    self.work.series = self.ingest['description']
    logger.info('Creating work record for {}'.format(self.work.title))

    # The primary identifier for this work is a HathiTrust bib reference
    self.work.primary_identifier = Identifier(
        type='hathi',
        identifier=self.ingest['bib_key'],
        weight=1
    )
    logger.debug('Setting primary_identifier to {}'.format(
        self.work.primary_identifier
    ))

    for idType, key in HathiRecord.identifierFields:
        logger.debug('Setting identifiers {}'.format(idType))
        self.parseIdentifiers(self.work, idType, key)

    # All government documents should be in the public_domain.
    self.parseGovDoc(self.ingest['gov_doc'])

    # The copyright date assigned to the work by HathiTrust
    self.work.addClassItem('dates', Date, **{
        'display_date': self.ingest['copyright_date'],
        'date_range': self.ingest['copyright_date'],
        'date_type': 'copyright_date'
    })
    logger.debug('Setting copyright date to {}'.format(
        self.ingest['copyright_date']
    ))

    self.parseAuthor(self.ingest['author'])
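# A plausible shape for HathiRecord.identifierFields (an assumption; the
# loop in buildWork only requires (idType, key) pairs tying identifier
# types to fields in the ingested Hathi row):
IDENTIFIER_FIELDS_SKETCH = [
    ('hathi', 'htid'),
    ('oclc', 'oclc_num'),
    ('isbn', 'isbns'),
]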
def parseClassification(classification):
    """Parse a classification into an identifier for the work record."""
    tag = classification.get('tag')
    subjectType = MARC_FIELDS[tag]

    classDict = {
        'type': subjectType,
        'identifier': classification.get('sfa'),
        'weight': 1
    }

    return Identifier.createFromDict(**classDict)
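# Assumed shape of the MARC_FIELDS lookup (a sketch, not necessarily the
# source's definition): Classify <class> elements carry standard MARC
# classification tags in their `tag` attribute.
MARC_FIELDS_SKETCH = {
    '050': 'lcc',  # Library of Congress Classification
    '082': 'ddc',  # Dewey Decimal Classification
}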
def transformMARC(record, marcRels):
    """Accepts a marcalyx object and transforms the MARC record into an SFR
    data object.
    """
    doabID = record[0]
    dateIssued = record[1]
    marcRecord = record[2]
    logger.info('Transforming record {} into an SFR object'.format(doabID))

    work = WorkRecord()
    instance = InstanceRecord()
    item = Format(source='doab', contentType='ebook')

    # Add issued date to work record
    work.addClassItem('dates', Date, **{
        'display_date': dateIssued,
        'date_range': dateIssued,
        'date_type': 'issued'
    })

    # All DOAB records have the same Creative Commons license; assign this
    # to the Instance/Item records
    rights = Rights(
        source='doab',
        license='https://creativecommons.org/licenses/by-nc-nd/4.0/',
        statement='Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International'
    )
    instance.rights.append(rights)
    item.rights.append(rights)

    # A single DOAB identifier can be assigned to the work/instance/item records
    doabIdentifier = Identifier(type='doab', identifier=doabID, weight=1)
    work.identifiers.append(doabIdentifier)
    instance.identifiers.append(doabIdentifier)
    item.identifiers.append(doabIdentifier)

    # Code Fields (Identifiers)
    logger.debug('Parsing 0X0-0XX Fields')
    controlData = [
        ('010', 'identifiers', 'a', 'lccn'),
        ('020', 'identifiers', 'a', 'isbn'),
        ('022', 'identifiers', 'a', 'issn'),
        ('050', 'identifiers', 'a', 'lcc'),
        ('082', 'identifiers', 'a', 'ddc'),
        ('010', 'identifiers', 'z', 'lccn'),
        ('020', 'identifiers', 'z', 'isbn'),
        ('022', 'identifiers', 'z', 'issn'),
        ('050', 'identifiers', 'z', 'lcc'),
        ('082', 'identifiers', 'z', 'ddc')
    ]
    for field in controlData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Author/Creator Fields
    logger.debug('Parsing 100, 110 & 111 Fields')
    agentData = ['100', '110', '111', '700', '710', '711']
    for agentField in agentData:
        extractAgentValue(marcRecord, work, agentField, marcRels)

    # Title Fields
    logger.debug('Parsing 21X-24X Fields')
    titleData = [
        ('210', 'alt_titles', 'a'),
        ('222', 'alt_titles', 'a'),
        ('242', 'alt_titles', 'a'),
        ('246', 'alt_titles', 'a'),
        ('247', 'alt_titles', 'a'),
        ('245', 'title', 'a'),
        ('245', 'sub_title', 'b')
    ]
    for field in titleData:
        extractSubfieldValue(marcRecord, work, field)
        extractSubfieldValue(marcRecord, instance, field)

    # Edition Fields
    logger.debug('Parsing Edition (250 & 260) Fields')
    editionData = [
        ('250', 'edition_statement', 'a'),
        ('250', 'edition_statement', 'b'),
        ('260', 'pub_place', 'a'),
        ('260', 'pub_date', 'c'),
        ('260', 'agents', 'b', 'publisher'),
        ('260', 'agents', 'f', 'manufacturer'),
        ('264', 'copyright_date', 'c')
    ]
    for field in editionData:
        extractSubfieldValue(marcRecord, instance, field)

    # Physical Details
    # TODO Load fields into items/measurements?
    logger.debug('Parsing Extent (300) Field')
    extentData = [
        ('300', 'extent', 'a'),
        ('300', 'extent', 'b'),
        ('300', 'extent', 'c'),
        ('300', 'extent', 'e'),
        ('300', 'extent', 'f')
    ]
    for field in extentData:
        extractSubfieldValue(marcRecord, instance, field)

    # Series Details
    logger.debug('Parsing Series (490) Field')
    seriesData = [
        ('490', 'series', 'a'),
        ('490', 'series_position', 'v')
    ]
    for field in seriesData:
        extractSubfieldValue(marcRecord, work, field)

    # Notes/Description details
    # TODO What fields should we bring in?
    logger.debug('Parsing TOC (505) Field')
    tocData = [
        ('505', 'table_of_contents', 'a'),
        ('520', 'summary', 'a')
    ]
    for field in tocData:
        extractSubfieldValue(marcRecord, instance, field)

    # Language Fields
    if len(marcRecord['546']) > 0:
        for lang in marcRecord['546'][0].subfield('a'):
            langs = re.split(r'/|\|', lang.value)
            for language in langs:
                logger.debug(
                    'Adding language {} to work and instance'.format(language)
                )
                langObj = pycountry.languages.get(name=language.strip())
                if langObj is None or langObj.alpha_3 == 'und':
                    logger.warning(
                        'Unable to parse language {}'.format(language)
                    )
                    continue
                sfrLang = Language(
                    language=language,
                    iso_2=langObj.alpha_2,
                    iso_3=langObj.alpha_3
                )
                work.language.append(sfrLang)
                instance.language.append(sfrLang)

    # Subject Details
    logger.debug('Parsing 6XX Subject Fields')
    subjectData = ['600', '610', '648', '650', '651', '655', '656', '657']
    for subjectType in subjectData:
        extractSubjects(marcRecord, work, subjectType)

    # Electronic Holding Details
    logger.debug('Parsing 856 (Electronic Holding) Field')
    extractHoldingsLinks(marcRecord['856'], instance, item)

    # TODO Load data for these fields
    # 76X-78X
    # 80X-83X

    instance.formats.append(item)
    work.instances.append(instance)

    return work, doabID
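# Usage sketch for transformMARC (illustrative, not from the source):
# `record` is a 3-tuple of DOAB identifier, issued date, and a marcalyx
# MARC record, and `marcRels` maps MARC relator codes to SFR agent roles
# (the mapping shape here is an assumption).
#
# work, doabID = transformMARC(
#     ('12345', '2019-01-01', marcRecord),
#     marcRels={'aut': 'author', 'edt': 'editor'},
# )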
def enhanceRecord(record):
    """Takes a single input record and retrieves data from the OCLC Classify
    service. Manages the overall workflow of the function.
    """
    try:
        workUUID = record['uuid']
        searchType = record['type']
        searchFields = record['fields']
        startPos = record.get('start', 0)
    except KeyError as e:
        logger.error('Missing attribute in data block!')
        logger.debug(e)
        raise DataError('Required attribute missing from data block')
    except TypeError as e:
        logger.error('Could not read data from source')
        logger.debug(e)
        raise DataError('Kinesis data contains non-dictionary value')

    logger.info('Starting to enhance work record {}'.format(workUUID))

    try:
        # Step 1: Generate a set of XML records retrieved from Classify.
        # This step also adds the OCLC identifiers to the sourceData record.
        classifyData = classifyRecord(
            searchType, searchFields, workUUID, start=startPos
        )

        # Step 2: Parse the data received from Classify into the SFR data model
        classifiedWork, instanceCount, oclcNo = readFromClassify(
            classifyData, workUUID
        )
        logger.debug('Instances found {}'.format(instanceCount))

        # Classify pages results 500 at a time; fetch up to 1500 editions
        # here and re-queue the work if more remain
        if instanceCount > 500:
            iterStop = startPos + instanceCount
            if instanceCount > 1500:
                iterStop = startPos + 1500
            for i in range(startPos + 500, iterStop, 500):
                classifyPage = classifyRecord(
                    searchType, searchFields, workUUID, start=i
                )
                extractAndAppendEditions(classifiedWork, classifyPage)

        if instanceCount > startPos + 1500:
            OutputManager.putQueue(
                {
                    'type': 'identifier',
                    'uuid': workUUID,
                    'fields': {
                        'idType': 'oclc',
                        'identifier': oclcNo,
                        'start': startPos + 1500
                    }
                },
                os.environ['CLASSIFY_QUEUE']
            )

        # This sets the primary identifier for processing by the db manager
        classifiedWork.primary_identifier = Identifier('uuid', workUUID, 1)

        # Step 3: Output this block to Kinesis
        outputObject = {
            'status': 200,
            'type': 'work',
            'method': 'update',
            'data': classifiedWork
        }

        # Send oversized works in 100-instance chunks as separate Kinesis
        # records before emitting the work itself
        while len(classifiedWork.instances) > 100:
            instanceChunk = classifiedWork.instances[0:100]
            del classifiedWork.instances[0:100]
            OutputManager.putKinesis(
                {
                    'status': 200,
                    'type': 'work',
                    'method': 'update',
                    'data': {
                        'instances': instanceChunk,
                        'primary_identifier': Identifier('uuid', workUUID, 1)
                    }
                },
                os.environ['OUTPUT_KINESIS'],
                workUUID
            )

        OutputManager.putKinesis(
            outputObject, os.environ['OUTPUT_KINESIS'], workUUID
        )
    except OCLCError as err:
        logger.error('OCLC Query for work {} failed with message: {}'.format(
            workUUID, err.message
        ))
        raise err

    return True
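# Input sketch (illustrative values): enhanceRecord expects a dict parsed
# from the incoming message with these required keys; `start` is optional
# and defaults to 0. The shape matches the message enhanceRecord itself
# re-queues for works with more than 1500 editions.
SAMPLE_RECORD = {
    'uuid': '9ad9d3a2-0000-0000-0000-000000000000',
    'type': 'identifier',
    'fields': {'idType': 'oclc', 'identifier': '12345', 'start': 0},
    'start': 0,
}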
def parseEdition(edition):
    """Parse an edition into an Instance record"""
    oclcIdentifier = edition.get('oclc')
    oclcNo = Identifier(
        'oclc',
        oclcIdentifier,
        1
    )
    identifiers = [oclcNo]

    fullEditionRec = None
    if OutputManager.checkRecentQueries(
        'lookup/{}/{}'.format('oclc', oclcIdentifier)
    ) is False:
        try:
            logger.info('Querying OCLC lookup for {}'.format(oclcIdentifier))
            oclcRoot = 'https://dev-platform.nypl.org/api/v0.1/research-now/v3/utils/oclc-catalog'
            oclcQuery = '{}?identifier={}&type={}'.format(
                oclcRoot, oclcIdentifier, 'oclc'
            )
            edResp = requests.get(oclcQuery, timeout=10)
            if edResp.status_code == 200:
                logger.debug('Found matching OCLC record')
                fullEditionRec = edResp.json()
        except Exception as err:
            logger.debug('Error received when querying OCLC catalog')
            logger.error(err)

    classifications = edition.findall('.//class', namespaces=NAMESPACE)
    classificationList = list(map(parseClassification, classifications))
    identifiers.extend(classificationList)

    holdings = Measurement(
        'holdings',
        edition.get('holdings'),
        1,
        MEASUREMENT_TIME,
        oclcIdentifier
    )
    digHoldings = Measurement(
        'digitalHoldings',
        edition.get('eholdings'),
        1,
        MEASUREMENT_TIME,
        oclcIdentifier
    )

    language = edition.get('language')
    editionTitle = edition.get('title')

    editionDict = {
        'title': editionTitle,
        'language': language,
        'identifiers': identifiers,
        'measurements': [holdings, digHoldings]
    }

    if fullEditionRec is not None:
        # Merge the richer catalog record with the Classify edition data
        outEdition = fullEditionRec
        outEdition['title'] = editionDict['title']
        outEdition['identifiers'].extend(editionDict['identifiers'])
        outEdition['measurements'].extend(editionDict['measurements'])
        outEdition['language'] = list(set(
            [outEdition['language'], editionDict['language']]
        ))
    else:
        outEdition = editionDict

    return InstanceRecord.createFromDict(**outEdition)
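# Rough shape of the Classify <edition> element parseEdition reads, inferred
# from the attribute and element access above (values illustrative):
#
# <edition oclc="12345" title="..." language="eng"
#          holdings="20" eholdings="3">
#   ...<class tag="082" sfa="813.54"/>...
# </edition>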
def test_identifier_repr(self):
    idenTest = Identifier(type='test', identifier='1')
    self.assertEqual(str(idenTest), '<Identifier(type=test, id=1)>')
def test_identifier_create(self):
    idenTest = Identifier()
    self.assertIsInstance(idenTest, Identifier)