if record.metadata.get_element('.//{0}type'.format(dc)): sourceResource['type'] = record.metadata.get_element( './/{0}type'.format(dc), delimiter=';') # webResource.fileFormat # aggregation.dataProvider data_provider = dprovide # aggregation.intermediateProvider # aggregation.isShownAt # aggregation.preview try: preview = assets.thumbnail_service(PURL_match, tn) except UnboundLocalError as err: # logging.warning('aggregation.preview: {0} - {1}'.format(err, oai_id)) print(err, oai_id) continue # aggregation.provider try: docs.append({ "@context": "http://api.dp.la/items/context", "sourceResource": sourceResource, "aggregatedCHO": "#sourceResource", "dataProvider": data_provider, "isShownAt": PURL_match, "preview": preview,
def SSDN_QDC(file_in, tn, dprovide, iprovide=None):
    """Crosswalk Qualified Dublin Core OAI records into DPLA documents.

    Records flagged as deleted (repox ``deleted="true"`` attribute or an
    OAI-PMH header with ``status="deleted"``) are skipped.  Records without
    a rights statement, a title, or an ``http`` identifier are logged as
    errors and skipped.

    :param file_in: path of the harvest file; opened as UTF-8 and handed to
        ``OAIReader``
    :param tn: thumbnail configuration, passed through to
        ``assets.thumbnail_service``
    :param dprovide: value used as the data provider of every document and
        as the provider of the CSV logger
    :param iprovide: intermediate provider, passed through to
        ``assets.build``
    :return: list of the documents returned by ``assets.build``
    """
    # Trailing "( Contributor )" / "( contributor )" marker on a creator value.
    # A regex is used because str.rstrip() strips a *set of characters* and
    # would also eat trailing letters of the name itself.
    contributor_marker = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)\s*$')
    rights_uri = re.compile('http://rightsstatements')

    # Date elements in order of preference; the first one with a value wins.
    date_paths = (
        './/{0}created'.format(dcterms),
        './/{0}issued'.format(dcterms),
        './/{0}date'.format(dcterms),
        './/{0}date'.format(dc),
        './/{0}available'.format(dcterms),
        './/{0}dateAccepted'.format(dcterms),
        './/{0}dateCopyrighted'.format(dcterms),
        './/{0}dateSubmitted'.format(dcterms),
    )

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_QDC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            lookup = record.metadata.get_element

            # sourceResource.alternative
            alt_title = lookup('.//{0}alternative'.format(dcterms))
            if alt_title:
                sourceResource['alternative'] = alt_title

            # sourceResource.collection
            collection = lookup('.//{0}isPartOf'.format(dcterms))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            contributor_path = './/{0}contributor'.format(dc)
            if lookup(contributor_path):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in lookup(contributor_path, delimiter=';')
                ]

            # sourceResource.creator
            creator_path = './/{0}creator'.format(dc)
            if lookup(creator_path):
                sourceResource['creator'] = []
                for name in lookup(creator_path, delimiter=';'):
                    # creators tagged ( Contributor ) / ( contributor ) are
                    # filed under contributor with the tag removed
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append(
                            {"name": contributor_marker.sub(
                                '', name.strip(" "))})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = None
            for date_path in date_paths:
                date = lookup(date_path)
                if date:
                    break
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            description = []
            for description_path in ('.//{0}description'.format(dc),
                                     './/{0}abstract'.format(dcterms)):
                found = lookup(description_path)
                if found is not None:
                    description.extend(found)
            if description:
                sourceResource['description'] = description

            # sourceResource.extent
            extent_path = './/{0}extent'.format(dcterms)
            if lookup(extent_path):
                sourceResource['extent'] = lookup(extent_path, delimiter=';')

            # sourceResource.format
            medium_path = './/{0}medium'.format(dcterms)
            if lookup(medium_path):
                # IANA media types are left out of sourceResource.format
                # TODO: route them to webResource.fileFormat
                formats = [
                    {'name': element.strip(' ')}
                    for element in lookup(medium_path, delimiter=';')
                    if element.lower() not in IANA_type_list
                    and len(element) > 0
                ]
                if formats:
                    sourceResource['format'] = formats

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            language_path = './/{0}language'.format(dc)
            if lookup(language_path):
                sourceResource['language'] = []
                for element in lookup(language_path, delimiter=';'):
                    # values of up to three characters are treated as codes
                    if len(element) > 3:
                        sourceResource['language'].append({"name": element})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": element})

            # sourceResource.place : sourceResource['spatial']
            spatial_path = './/{0}spatial'.format(dcterms)
            if lookup(spatial_path):
                for place in lookup(spatial_path, delimiter=';'):
                    try:
                        float(place)  # purely numeric values are ignored
                    except ValueError:
                        # NOTE(review): each non-numeric value replaces the
                        # previous one, so only the last is kept -- confirm
                        # this is intended
                        sourceResource['spatial'] = [place]

            # sourceResource.publisher
            publisher = lookup('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights_statements = lookup('.//{0}rights'.format(dc))
            if rights_statements:
                for rights_statement in rights_statements:
                    uri = rights_uri.search(rights_statement)
                    if uri:
                        # a rightsstatements URI wins over any free text
                        sourceResource['rights'] = [{
                            "@id": uri.string.strip()
                        }]
                        break
                    sourceResource['rights'] = [{
                        "text": rights_statement.strip()
                    }]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            subject_path = './/{0}subject'.format(dc)
            if lookup(subject_path):
                sourceResource['subject'] = [
                    {"name": name}
                    for name in lookup(subject_path, delimiter=';')
                ]

            # sourceResource.temporal
            temporal = lookup('.//{0}temporal'.format(dcterms))
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.title
            title = lookup('.//{0}title'.format(dc))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            type_path = './/{0}type'.format(dc)
            if lookup(type_path):
                sourceResource['type'] = lookup(type_path, delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            # aggregation.preview
            # Both are reset for every record so a record without an http
            # identifier can never inherit the previous record's URL.
            is_shown_at = None
            preview = None
            for identifier in lookup('.//{0}identifier'.format(dc)) or []:
                if 'http' in identifier:
                    is_shown_at = identifier
                    preview = assets.thumbnail_service(identifier, tn)

            # aggregation.provider

            # build record
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
    return docs
def SSDN_MODS(file_in, tn, dprovide, iprovide=None):
    """Crosswalk MODS OAI records into DPLA documents.

    Records flagged as deleted (repox ``deleted="true"`` attribute or an
    OAI-PMH header with ``status="deleted"``) and records without metadata
    are skipped.  Records without a PURL, a rights statement, or a title
    are logged as errors and skipped.

    :param file_in: path of the harvest file; opened as UTF-8 and handed to
        ``OAIReader``
    :param tn: thumbnail configuration, passed through to
        ``assets.thumbnail_service``
    :param dprovide: value used as the data provider of every document and
        as the provider of the CSV logger
    :param iprovide: intermediate provider, passed through to
        ``assets.build``
    :return: list of the documents returned by ``assets.build``
    """
    toc_path = './/{http://www.loc.gov/mods/v3}tableOfContents'

    def name_entry(name):
        # include "@id" only when the name carries a URI
        if name.uri:
            return {"@id": name.uri, "name": name.text}
        return {"name": name.text}

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            if VERBOSE:
                print(record.oai_urn)
            logger.debug(record.oai_urn)
            sourceResource = {}

            if record.metadata is None:
                continue

            # sourceResource.alternative
            # every title after the first is an alternative title
            titles = record.metadata.titles
            if len(titles) > 1:
                sourceResource['alternative'] = list(titles[1:])

            # sourceResource.collection

            # sourceResource.contributor
            # names with a complete role that is not creator; collected in a
            # list so several contributors do not overwrite each other
            contributors = []
            try:
                for name in record.metadata.names:
                    if (name.role.text != 'Creator'
                            and name.role.code != 'cre'
                            and name.role.text is not None
                            and name.role.code is not None):
                        contributors.append(name_entry(name))
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, record.oai_urn))
            if contributors:
                sourceResource['contributor'] = contributors

            # sourceResource.creator
            # explicit creators plus names whose role text or code is missing
            name_list = []
            if record.metadata.get_creators:
                name_list.extend(record.metadata.get_creators)
            if record.metadata.names:
                name_list.extend(
                    name for name in record.metadata.names
                    if name.role.text is None or name.role.code is None)
            sourceResource['creator'] = [name_entry(name)
                                         for name in name_list]

            # sourceResource.date
            if record.metadata.dates:
                date = record.metadata.dates[0].text
                if ' - ' in date:
                    # range: first and last four characters are used as
                    # begin and end
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description
            # abstracts first, then any tables of contents
            if record.metadata.abstract:
                sourceResource['description'] = [
                    abstract.text for abstract in record.metadata.abstract
                ]
            toc_texts = [toc.text
                         for toc in record.metadata.iterfind(toc_path)]
            if toc_texts:
                sourceResource.setdefault('description', []).extend(toc_texts)

            # sourceResource.extent
            if record.metadata.extent:
                sourceResource['extent'] = record.metadata.extent

            # sourceResource.format
            if record.metadata.genre:
                sourceResource['format'] = [
                    {'name': genre.text, '@id': genre.uri}
                    if genre.uri else {'name': genre.text}
                    for genre in record.metadata.genre
                ]

            # sourceResource.identifier
            try:
                purl = record.metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, record.oai_urn))
                continue
            sourceResource['identifier'] = purl

            # sourceResource.language
            try:
                if record.metadata.language:
                    sourceResource['language'] = [{
                        "name": lang.text,
                        "iso_639_3": lang.code
                    } for lang in record.metadata.language]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.place : sourceResource['spatial']
            # Children are read by iterating the element; getchildren() no
            # longer exists in xml.etree on current Python versions.
            for subject in record.metadata.subjects or []:
                for child in subject.elem:
                    if 'eographic' in child.tag:
                        # NOTE(review): each geographic subject replaces the
                        # previous one, so only the last is kept -- confirm
                        # this is intended
                        sourceResource['spatial'] = {"name": subject.text}

            # sourceResource.publisher
            if record.metadata.publisher:
                sourceResource['publisher'] = record.metadata.publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            if record.metadata.rights:
                # slicing isn't ideal here since it depends on element order
                sourceResource['rights'] = [
                    {"@id": rights.text}
                    if "http://rightsstatements.org" in rights.text
                    else {"text": rights.text}
                    for rights in record.metadata.rights[:2]
                ]
            else:
                logger.error('No sourceResource.rights - {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.subject
            try:
                if record.metadata.subjects:
                    sourceResource['subject'] = []
                    for subject in record.metadata.subjects:
                        for child in subject.elem:
                            if 'eographic' not in child.tag:
                                sourceResource['subject'].append(
                                    {"name": subject.text})
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.title
            if record.metadata.titles:
                sourceResource['title'] = [
                    '{}'.format(record.metadata.titles[0])
                ]
            else:
                logger.error('No sourceResource.title: {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.type
            sourceResource['type'] = record.metadata.type_of_resource

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            pid = record.metadata.pid
            if pid is None:
                # fall back to the last segment of the OAI URN
                pid = record.oai_urn.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            if not purl:
                logger.error('No aggregation.isShownAt - {0}'.format(
                    record.oai_urn))
                continue
            doc = assets.build(record.oai_urn, sourceResource,
                               data_provider, purl, preview, iprovide)
            docs.append(doc)
    return docs
def FlaLD_QDC(file_in, tn, dprovide, iprovide=None):
    """Crosswalk Qualified Dublin Core records into DPLA documents.

    Records whose ``deleted`` attribute is ``'true'`` are skipped.  Records
    without a rights statement, a title, or an ``http`` identifier are
    logged as warnings and skipped.

    :param file_in: path of the harvest file; opened as UTF-8 and handed to
        ``OAI_QDC``
    :param tn: thumbnail configuration, passed through to
        ``assets.thumbnail_service``
    :param dprovide: value used as ``dataProvider`` of every document
    :param iprovide: accepted for signature parity with the other
        crosswalks; not used by this function
    :return: list of document dicts
    """
    dc_ns = nameSpace_default['dc']
    dcterms_ns = nameSpace_default['dcterms']
    rightsURI = re.compile('http://rightsstatements')

    def cleaned_terms(record, path):
        # flatten OAI_QDC.split_lookup and drop empty values
        return [term.strip(' ')
                for element in OAI_QDC.split_lookup(record, path)
                for term in element
                if len(term) > 0]

    with open(file_in, encoding='utf-8') as data_in:
        records = OAI_QDC(data_in)
        docs = []
        for record in records.record_list:

            if record.attrib.get('deleted') == 'true':
                continue

            oai_id = record.attrib['id']
            sourceResource = {}

            # sourceResource.alternative
            alt_title = OAI_QDC.simple_lookup(
                record, './/{0}alternative'.format(dcterms_ns))
            if alt_title is not None:
                sourceResource['alternative'] = alt_title

            # sourceResource.collection

            # sourceResource.contributor
            contributor_path = './/{0}contributor'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, contributor_path) is not None:
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in cleaned_terms(record, contributor_path)
                ]

            # sourceResource.creator
            creator_path = './/{0}creator'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, creator_path) is not None:
                sourceResource['creator'] = [
                    {"name": name}
                    for name in cleaned_terms(record, creator_path)
                ]

            # sourceResource.date
            date = OAI_QDC.simple_lookup(
                record, './/{0}created'.format(dcterms_ns))
            if date is not None:
                sourceResource['date'] = {"begin": date[0], "end": date[0]}

            # sourceResource.description
            # dc:description values first, then dcterms:abstract values
            description = []
            for description_path in ('.//{0}description'.format(dc_ns),
                                     './/{0}abstract'.format(dcterms_ns)):
                found = OAI_QDC.simple_lookup(record, description_path)
                if found is not None:
                    description.extend(found)
            if len(description) > 1:
                sourceResource['description'] = description
            elif len(description) == 1:
                # a single description is emitted as a bare value
                sourceResource['description'] = description[0]

            # sourceResource.extent
            extent_path = './/{0}extent'.format(dcterms_ns)
            if OAI_QDC.simple_lookup(record, extent_path) is not None:
                sourceResource['extent'] = cleaned_terms(record, extent_path)

            # sourceResource.format

            # sourceResource.genre
            format_path = './/{0}format'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, format_path) is not None:
                # IANA media types are left out of sourceResource.genre
                genres = [
                    term.strip(' ')
                    for element in OAI_QDC.split_lookup(record, format_path)
                    for term in element
                    if term.lower() not in IANA_type_list and len(term) > 0
                ]
                if genres:
                    sourceResource['genre'] = genres

            # sourceResource.identifier
            local_id = OAI_QDC.simple_lookup(
                record, './/{0}identifier'.format(dc_ns))
            if local_id is not None:
                sourceResource['identifier'] = local_id[0]

            # sourceResource.language
            language_path = './/{0}language'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, language_path) is not None:
                sourceResource['language'] = []
                for element in OAI_QDC.split_lookup(record, language_path):
                    for term in element:
                        # NOTE(review): each term replaces the whole field
                        # with a single dict, so only the last term is kept
                        # -- confirm this is intended
                        if len(term) > 3:
                            sourceResource['language'] = {"name": term}
                        else:
                            sourceResource['language'] = {"iso_639_3": term}

            # sourceResource.place : sourceResource['spatial']
            spatial_path = './/{0}spatial'.format(dcterms_ns)
            if OAI_QDC.simple_lookup(record, spatial_path) is not None:
                sourceResource['spatial'] = cleaned_terms(
                    record, spatial_path)

            # sourceResource.publisher
            publisher = OAI_QDC.simple_lookup(
                record, './/{0}publisher'.format(dc_ns))
            if publisher is not None:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights_path = './/{0}rights'.format(dc_ns)
            rights_statements = OAI_QDC.simple_lookup(record, rights_path)
            if rights_statements is not None:
                if len(record.findall(rights_path)) > 1:
                    # Several statements: pick out the URI and the free
                    # text.  Both start empty for every record so nothing
                    # leaks in from a previous one.
                    URI_match = None
                    rights_text = None
                    for rights_statement in rights_statements:
                        URI = rightsURI.search(rights_statement)
                        if URI:
                            URI_match = URI.string.split(" ")[-1]
                        else:
                            rights_text = rights_statement
                    rights = {}
                    if URI_match is not None:
                        rights["@id"] = URI_match
                    if rights_text is not None:
                        rights["text"] = rights_text
                    sourceResource['rights'] = rights
                else:
                    sourceResource['rights'] = rights_statements
            else:
                logging.warning(
                    'No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            subject_path = './/{0}subject'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, subject_path) is not None:
                sourceResource['subject'] = [
                    {"name": term}
                    for term in cleaned_terms(record, subject_path)
                ]

            # sourceResource.title
            title = OAI_QDC.simple_lookup(
                record, './/{0}title'.format(dc_ns))
            if title is not None:
                sourceResource['title'] = title
            else:
                logging.warning(
                    'No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            type_path = './/{0}type'.format(dc_ns)
            if OAI_QDC.simple_lookup(record, type_path) is not None:
                sourceResource['type'] = cleaned_terms(record, type_path)

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            # aggregation.preview
            # Reset for every record; a record without an http identifier
            # must not reuse the previous record's URL or thumbnail.
            is_shown_at = None
            preview = None
            for identifier in local_id or []:
                if 'http' in identifier:
                    is_shown_at = identifier
                    preview = assets.thumbnail_service(identifier, tn)

            if is_shown_at is None:
                logging.warning(
                    'No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.provider

            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": is_shown_at,
                         "preview": preview,
                         "provider": PROVIDER})
    return docs
def SSDN_DC(file_in, tn, dprovide, iprovide=None):
    """Crosswalk simple Dublin Core OAI records into DPLA documents.

    Records flagged as deleted (repox ``deleted="true"`` attribute or an
    OAI-PMH header with ``status="deleted"``) are skipped.  Records without
    a rights statement, a title, or an ``http`` identifier are logged as
    errors and skipped.

    :param file_in: path of the harvest file; opened as UTF-8 and handed to
        ``OAIReader``
    :param tn: thumbnail configuration, passed through to
        ``assets.thumbnail_service``
    :param dprovide: value used as the data provider of every document and
        as the provider of the CSV logger
    :param iprovide: intermediate provider, passed through to
        ``assets.build``
    :return: list of the documents returned by ``assets.build``
    """
    # Patterns are compiled once per call rather than once per value.
    mark_up_re = re.compile('<.*?>')
    new_line_re = re.compile('\n')
    lcsh_re = re.compile(r"\( lcsh \)$")
    rights_uri = re.compile('http://rightsstatements')
    # Trailing "( Contributor )" / "( contributor )" marker on a creator value.
    # A regex is used because str.rstrip() strips a *set of characters* and
    # would also eat trailing letters of the name itself.
    contributor_marker = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)\s*$')

    def clean_mark_up(text):
        # drop markup tags, then turn newlines into spaces
        clean_text = mark_up_re.sub('', text)
        return new_line_re.sub(' ', clean_text)

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('SSDN_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}
            lookup = record.metadata.get_element

            # sourceResource.alternative

            # sourceResource.collection
            collection = lookup('.//{0}relation'.format(dc))
            if collection:
                sourceResource['collection'] = collection

            # sourceResource.contributor
            contributor_path = './/{0}contributor'.format(dc)
            if lookup(contributor_path):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in lookup(contributor_path, delimiter=';')
                ]

            # sourceResource.creator
            creator_path = './/{0}creator'.format(dc)
            if lookup(creator_path):
                sourceResource['creator'] = []
                for name in lookup(creator_path, delimiter=';'):
                    # creators tagged ( Contributor ) / ( contributor ) are
                    # filed under contributor with the tag removed
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append(
                            {"name": contributor_marker.sub(
                                '', name.strip(" "))})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = lookup('.//{0}date'.format(dc))
            if date:
                try:
                    d = dateparser.parse(
                        date[0], languages=['en']).date().isoformat()
                    sourceResource['date'] = {
                        "begin": d,
                        "end": d,
                        "displayDate": d
                    }
                except AttributeError as err:
                    # the parse result had no .date(); keep the raw value
                    logger.warning('sourceResource.date: {0}, {1}'.format(
                        err, record.oai_urn))
                    sourceResource['date'] = date[0]

            # sourceResource.description
            description_path = './/{0}description'.format(dc)
            if lookup(description_path):
                sourceResource['description'] = [
                    clean_mark_up(desc)
                    for desc in lookup(description_path, delimiter=';')
                ]

            # sourceResource.extent

            # sourceResource.format
            dc_format = lookup('.//{0}format'.format(dc))
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            sourceResource['identifier'] = oai_id

            # sourceResource.language
            language_path = './/{0}language'.format(dc)
            if lookup(language_path):
                sourceResource['language'] = list(
                    lookup(language_path, delimiter=';'))

            # sourceResource.place : sourceResource['spatial']
            coverage = lookup('.//{0}coverage'.format(dc))
            if coverage:
                sourceResource['spatial'] = [{'name': place}
                                             for place in coverage]

            # sourceResource.publisher
            publisher = lookup('.//{0}publisher'.format(dc))
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights_statements = lookup('.//{0}rights'.format(dc))
            if rights_statements:
                for rights_statement in rights_statements:
                    uri = rights_uri.search(rights_statement)
                    if uri:
                        # a rightsstatements URI wins over any free text
                        sourceResource['rights'] = [{
                            "@id": uri.string.strip()
                        }]
                        break
                    sourceResource['rights'] = [{
                        "text": rights_statement.strip()
                    }]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            subject_path = './/{0}subject'.format(dc)
            if lookup(subject_path):
                sourceResource['subject'] = []
                for term in lookup(subject_path, delimiter=';'):
                    term = lcsh_re.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.temporal

            # sourceResource.title
            title = lookup('.//{0}title'.format(dc))
            if title is not None:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            type_path = './/{0}type'.format(dc)
            if lookup(type_path):
                sourceResource['type'] = lookup(type_path, delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            # Reset for every record so a record without an http identifier
            # can never inherit the previous record's URL.
            is_shown_at = None
            for identifier in lookup('.//{0}identifier'.format(dc)) or []:
                if 'http' in identifier:
                    is_shown_at = identifier

            # aggregation.preview
            preview = None
            try:
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
    return docs
def FlaLD_MODS(file_in, tn, dprovide, iprovide=None):
    """Crosswalk MODS records into DPLA documents.

    Records without a rights statement or a title are logged as warnings
    and skipped.

    :param file_in: path of the MODS file; opened as UTF-8 and handed to
        ``MODSReader``
    :param tn: thumbnail configuration, passed through to
        ``assets.thumbnail_service``
    :param dprovide: value used as ``dataProvider`` of every document
    :param iprovide: accepted for signature parity with the other
        crosswalks; not used by this function
    :return: list of document dicts
    """
    # subjects made up entirely of non-letters are dropped
    non_alpha_char = re.compile("^[^a-zA-Z]+$")
    tgn_attribution = ("This record contains information from Thesaurus of "
                       "Geographic Names (TGN) which is made available under "
                       "the ODC Attribution License.")

    def name_entry(name):
        # include "@id" only when the name carries a valueURI
        if 'valueURI' in name.keys():
            return {"@id": name['valueURI'], "name": name['text']}
        return {"name": name['text']}

    def genre_entry(genre):
        # 'term' -> name, 'valueURI' -> @id; any other key is ignored
        entry = {}
        for key, value in genre.items():
            if 'term' == key:
                entry['name'] = value
            elif 'valueURI' == key:
                entry['@id'] = value
        return entry

    def single_or_list(values):
        # one value is emitted bare, several as a list
        if len(values) > 1:
            return list(values)
        return values[0]

    with open(file_in, encoding='utf-8') as data_in:
        records = MODSReader(data_in)
        docs = []
        for record in records:

            sourceResource = {}

            # sourceResource.alternative
            # every title after the first; the key is only written when
            # there is at least one alternative title
            titles = record.title_constructor()
            if titles is not None and len(titles[1:]) >= 1:
                sourceResource['alternative'] = list(titles[1:])

            # sourceResource.collection
            collection = record.collection()
            if collection is not None:
                sourceResource['collection'] = {}
                if 'title' in collection.keys():
                    sourceResource['collection']['name'] = collection['title']
                if 'location' in collection.keys():
                    sourceResource['collection']['host'] = \
                        collection['location']
                if 'url' in collection.keys():
                    sourceResource['collection']['_:id'] = collection['url']

            names = record.name_constructor()

            # sourceResource.contributor
            # names without any role, or with a role other than creator;
            # roleText takes precedence over roleCode when both are present
            contributors = []
            try:
                if names is not None:
                    for name in names:
                        if 'roleText' in name.keys():
                            is_contributor = \
                                name['roleText'].lower() != 'creator'
                        elif 'roleCode' in name.keys():
                            is_contributor = name['roleCode'].lower() != 'cre'
                        else:
                            is_contributor = True
                        if is_contributor:
                            contributors.append(name_entry(name))
            except KeyError as err:
                logging.warning(
                    'sourceResource.contributor: {0}, {1}\n'.format(
                        err, record.pid_search()))
            if contributors:
                sourceResource['contributor'] = contributors

            # sourceResource.creator
            if names is not None:
                sourceResource['creator'] = []
                for name in names:
                    if 'roleText' in name.keys():
                        is_creator = name['roleText'].lower() == 'creator'
                    elif 'roleCode' in name.keys():
                        is_creator = name['roleCode'].lower() == 'cre'
                    else:
                        is_creator = False
                    if is_creator:
                        sourceResource['creator'].append(name_entry(name))

            # sourceResource.date
            date = record.date_constructor()
            if date is not None:
                if ' - ' in date:
                    # range: first and last four characters are used as
                    # begin and end
                    sourceResource['date'] = {"displayDate": date,
                                              "begin": date[0:4],
                                              "end": date[-4:]}
                else:
                    sourceResource['date'] = {"displayDate": date,
                                              "begin": date,
                                              "end": date}

            # sourceResource.description
            # unlike extent/format, a single abstract stays wrapped in a list
            abstract = record.abstract()
            if abstract is not None:
                sourceResource['description'] = list(abstract)

            # sourceResource.extent
            extent = record.extent()
            if extent is not None:
                sourceResource['extent'] = single_or_list(extent)

            # sourceResource.format
            form = record.form()
            if form is not None:
                sourceResource['format'] = single_or_list(form)

            # sourceResource.genre
            genres = record.genre()
            if genres is not None:
                if len(genres) > 1:
                    sourceResource['genre'] = [genre_entry(genre)
                                               for genre in genres]
                else:
                    sourceResource['genre'] = genre_entry(genres[0])

            # sourceResource.identifier
            sourceResource['identifier'] = {
                "@id": record.purl_search(),
                "text": record.local_identifier()
            }

            # sourceResource.language
            languages = record.language()
            if languages is not None:
                language_list = []
                for language in languages:
                    if len(language) > 1:
                        language_list.append({"name": language['text'],
                                              "iso_639_3": language['code']})
                    elif 'text' in language.keys():
                        language_list.append({"name": language['text']})
                    # entries with neither shape are skipped instead of
                    # re-appending the previous entry's dict
                sourceResource['language'] = language_list

            # sourceResource.place : sourceResource['spatial']
            geo_code_list = record.geographic_code()
            if geo_code_list is not None:
                sourceResource['spatial'] = []
                for geo_code in geo_code_list:
                    _, lat, long, label = assets.tgn_cache(geo_code)
                    sourceResource['spatial'].append({
                        "lat": lat,
                        "long": long,
                        "name": label,
                        "_:attribution": tgn_attribution
                    })

            # sourceResource.publisher
            publisher = record.publisher()
            if publisher is not None:
                sourceResource['publisher'] = single_or_list(publisher)

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = record.rights()
            if rights is not None:
                if len(rights) > 1:
                    sourceResource['rights'] = {"@id": rights['URI'],
                                                "text": rights['text']}
                else:
                    sourceResource['rights'] = rights['text']
            else:
                logging.warning('No sourceResource.rights - {0}'.format(
                    record.pid_search()))
                continue

            # sourceResource.subject
            try:
                subjects = record.subject()
                if subjects is not None:
                    sourceResource['subject'] = []
                    for subject in subjects:
                        if non_alpha_char.match(subject['text']) is None:
                            if 'valueURI' in subject.keys():
                                sourceResource['subject'].append(
                                    {"@id": subject['valueURI'],
                                     "name": subject['text']})
                            else:
                                sourceResource['subject'].append(
                                    {"name": subject['text']})
            except TypeError as err:
                logging.warning('sourceResource.subject: {0}, {1}\n'.format(
                    err, record.pid_search()))

            # sourceResource.title
            if titles is not None:
                sourceResource['title'] = titles[0]
            else:
                logging.warning('No sourceResource.title: {0}'.format(
                    record.pid_search()))
                continue

            # sourceResource.type
            sourceResource['type'] = record.type_of_resource()

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider
            # TODO

            # aggregation.isShownAt

            # aggregation.preview
            pid = record.pid_search()
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": record.purl_search(),
                         "preview": preview,
                         "provider": PROVIDER})
    return docs
def FlaLD_DC(file_in, tn, dprovide, iprovide=None):
    """Map FlaLD OAI Dublin Core records to DPLA-style documents (legacy).

    Parses ``file_in`` with ``OAI_QDC`` and builds one dictionary per record
    that is not flagged as deleted and that carries rights, a title and a
    dPanther PURL (used for ``isShownAt`` and the thumbnail lookup).

    NOTE: another ``def FlaLD_DC`` appears later in this file; if both live
    at module level, the later one rebinds the name and this implementation
    is shadowed.

    :param file_in: path of the harvested XML file; opened as UTF-8
    :param tn: forwarded unchanged to ``assets.thumbnail_service``
    :param dprovide: value stored as the document's ``dataProvider``
    :param iprovide: accepted for signature parity; not used by this mapping
    :return: list of document dictionaries
    """
    dc_ns = nameSpace_default['dc']
    # Patterns are loop-invariant, so compile them once.
    purl_pattern = re.compile('dpService/dpPurlService/purl')
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() removes a *set of characters*, which also eats the tail of
    # the name itself ("Robert ( Contributor )" -> "Robe"); strip the literal
    # marker instead.
    contributor_suffix = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)\s*$')

    def lookup(record, tag):
        """Return ``OAI_QDC.simple_lookup`` for a Dublin Core element."""
        return OAI_QDC.simple_lookup(record, './/{0}{1}'.format(dc_ns, tag))

    def split_lookup(record, tag):
        """Return ``OAI_QDC.split_lookup`` for a Dublin Core element."""
        return OAI_QDC.split_lookup(record, './/{0}{1}'.format(dc_ns, tag))

    with open(file_in, encoding='utf-8') as data_in:
        records = OAI_QDC(data_in)
        docs = []
        for record in records.record_list:

            # Skip only records explicitly flagged deleted="true"; a record
            # that merely carries the attribute is still mapped.
            if ('deleted' in record.attrib.keys()
                    and record.attrib['deleted'] == 'true'):
                continue

            oai_id = record.attrib['id']
            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection

            # sourceResource.contributor
            if lookup(record, 'contributor') is not None:
                sourceResource['contributor'] = []
                for element in split_lookup(record, 'contributor'):
                    for name in element:
                        if len(name) > 0:
                            sourceResource['contributor'].append(
                                {"name": name.strip(" ")})

            # sourceResource.creator
            if lookup(record, 'creator') is not None:
                sourceResource['creator'] = []
                for element in split_lookup(record, 'creator'):
                    for name in element:
                        # matches both "( Contributor )" and "( contributor )"
                        if "ontributor )" in name:
                            sourceResource.setdefault('contributor', []).append(
                                {"name": contributor_suffix.sub(
                                    '', name.strip(" "))})
                        elif len(name) > 0:
                            sourceResource['creator'].append(
                                {"name": name.strip(" ")})

            # sourceResource.date
            date = lookup(record, 'date')
            if date is not None:
                sourceResource['date'] = {"begin": date[0], "end": date[0]}

            # sourceResource.description
            description = lookup(record, 'description')
            if description is not None:
                description = list(description)
                if len(description) > 1:
                    sourceResource['description'] = description
                elif len(description) == 1:
                    sourceResource['description'] = description[0]

            # sourceResource.extent

            # sourceResource.format
            dpla_format = lookup(record, 'format')
            if dpla_format is not None:
                sourceResource['format'] = dpla_format

            # sourceResource.genre

            # sourceResource.identifier
            # Reset per record so a PURL from the previous record can never be
            # reused for this one.
            purl_match = None
            identifier = lookup(record, 'identifier')
            if identifier is not None:
                other_ids = []
                for ident in identifier:
                    try:
                        if purl_pattern.search(ident):
                            purl_match = ident
                        else:
                            other_ids.append(ident)
                    except TypeError as err:
                        logging.warning(
                            'sourceResource.identifier: {0} - {1}\n'.format(
                                err, oai_id))
                # With several identifiers only the non-PURL ones are kept;
                # a lone identifier is stored as returned by the lookup.
                if len(identifier) > 1:
                    sourceResource['identifier'] = other_ids
                else:
                    sourceResource['identifier'] = identifier

            # sourceResource.language
            if lookup(record, 'language') is not None:
                sourceResource['language'] = []
                for element in split_lookup(record, 'language'):
                    for term in element:
                        # Append so that every language survives, not just
                        # the last one seen.
                        if len(term) > 3:
                            sourceResource['language'].append({"name": term})
                        else:
                            sourceResource['language'].append(
                                {"iso_639_3": term})

            # sourceResource.place : sourceResource['spatial']
            place = lookup(record, 'coverage')
            if place is not None:
                sourceResource['spatial'] = place

            # sourceResource.publisher
            publisher = lookup(record, 'publisher')
            if publisher is not None:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = lookup(record, 'rights')
            if rights is not None:
                sourceResource['rights'] = rights
            else:
                logging.warning(
                    'No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if lookup(record, 'subject') is not None:
                sourceResource['subject'] = []
                for element in split_lookup(record, 'subject'):
                    for term in element:
                        term = lcsh_suffix.sub('', term)
                        if len(term) > 0:
                            sourceResource['subject'].append(
                                {"name": term.strip(" ")})

            # sourceResource.title
            title = lookup(record, 'title')
            if title is not None:
                sourceResource['title'] = title
            else:
                logging.warning(
                    'No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if lookup(record, 'type') is not None:
                sourceResource['type'] = []
                for element in split_lookup(record, 'type'):
                    for term in element:
                        if len(term) > 0:
                            sourceResource['type'].append(term.strip(" "))

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt / aggregation.preview
            # Without a PURL there is neither a landing page nor a thumbnail
            # key, so the record cannot be published.
            if purl_match is None:
                logging.warning('aggregation.preview: {0} - {1}\n'.format(
                    'no dPanther PURL found', oai_id))
                continue
            preview = assets.thumbnail_service(purl_match, tn)

            # aggregation.provider
            docs.append({"@context": "http://api.dp.la/items/context",
                         "sourceResource": sourceResource,
                         "aggregatedCHO": "#sourceResource",
                         "dataProvider": data_provider,
                         "isShownAt": purl_match,
                         "preview": preview,
                         "provider": PROVIDER})
    return docs
def FlMem(file_in, tn, dprovide, iprovide=None):
    """Map Florida Memory OAI Dublin Core records to DPLA documents.

    Reads ``file_in`` through ``OAIReader``, skips records flagged as deleted
    (repox ``deleted`` attribute or OAI-PMH header ``status``), and builds
    one document per record with ``assets.build``.  Records without a title
    or without an ``http`` identifier (needed for ``isShownAt``) are logged
    and skipped.

    :param file_in: path of the harvested XML file; opened as UTF-8
    :param tn: forwarded unchanged to ``assets.thumbnail_service``
    :param dprovide: data provider; also used to label the CSV logger
    :param iprovide: intermediate provider, forwarded to ``assets.build``
    :return: list of the objects returned by ``assets.build``
    """
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() removes a *set of characters*, which also eats the tail of
    # the name itself ("Robert ( Contributor )" -> "Robe"); strip the literal
    # marker instead.
    contributor_suffix = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)\s*$')

    def element(record, tag, **kwargs):
        """Look up a Dublin Core element on the record's metadata."""
        return record.metadata.get_element(
            './/{0}{1}'.format(dc, tag), **kwargs)

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlMem', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection
            source = element(record, 'source')
            if source:
                sourceResource['collection'] = {'name': source[0]}

            # sourceResource.contributor
            if element(record, 'contributor'):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in element(record, 'contributor', delimiter=';')
                ]

            # sourceResource.creator
            if element(record, 'creator'):
                sourceResource['creator'] = []
                for name in element(record, 'creator', delimiter=';'):
                    # matches both "( Contributor )" and "( contributor )"
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append(
                            {"name": contributor_suffix.sub(
                                '', name.strip(" "))})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = element(record, 'date')
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if element(record, 'description'):
                sourceResource['description'] = element(
                    record, 'description', delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            dc_format = element(record, 'format')
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            # Reset per record: otherwise a record without an http identifier
            # silently inherits the previous record's landing page.
            is_shown_at = None
            # `or []` keeps a record with no identifier from aborting the
            # whole harvest when the lookup yields nothing iterable.
            for identifier in element(record, 'identifier') or []:
                if 'http' in identifier:
                    # swap whatever host was harvested for the public one
                    is_shown_at = identifier.replace(
                        identifier.split('/')[2], 'www.floridamemory.com')
                    is_shown_at = is_shown_at.replace('http:', 'https:')
            sourceResource['identifier'] = oai_id.replace(
                oai_id.split(':')[1], 'www.floridamemory.com')

            # sourceResource.language
            if element(record, 'language'):
                sourceResource['language'] = [
                    assets.iso639_2code(lang.split('-')[0])
                    for lang in element(record, 'language', delimiter=';')
                ]

            # sourceResource.place : sourceResource['spatial']
            coverage = element(record, 'coverage')
            if coverage:
                sourceResource['spatial'] = [
                    {'name': place} for place in coverage
                ]

            # sourceResource.publisher
            publisher = element(record, 'publisher')
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            # TODO: hard-coding is only temporary
            sourceResource['rights'] = {
                '@id': 'http://rightsstatements.org/vocab/NoC-US/1.0/'
            }
            # rights = record.metadata.get_element('.//{0}rights'.format(dc))
            # if rights:
            #     sourceResource['rights'] = [{'text': rights[0]}]
            # else:
            #     logger.error('No sourceResource.rights - {0}'.format(oai_id))
            #     # continue  # TODO renable for prod
            #     pass  # local test

            # sourceResource.subject
            if element(record, 'subject'):
                sourceResource['subject'] = []
                for term in element(record, 'subject', delimiter=';'):
                    term = lcsh_suffix.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(". ")})

            # sourceResource.title
            title = element(record, 'title')
            if title:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.temporal
            # NOTE(review): reads dc:coverage, the same element used for
            # spatial above - confirm this duplication is intended.
            temporal = element(record, 'coverage')
            if temporal:
                sourceResource['temporal'] = temporal

            # sourceResource.type
            # 'type' is never set before this point, so dc:type is assigned
            # directly and dc:format values are concatenated onto it.
            if element(record, 'type'):
                sourceResource['type'] = element(
                    record, 'type', delimiter=';')

            if element(record, 'format'):
                if 'type' in sourceResource.keys():
                    sourceResource['type'] = (
                        sourceResource['type'] + element(record, 'format'))
                else:
                    sourceResource['type'] = element(
                        record, 'format', delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # aggregation.preview
            preview = assets.thumbnail_service(is_shown_at, tn)

            # aggregation.provider

            # build record
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
    return docs
def FlaLD_MODS(file_in, tn, dprovide, iprovide=None):
    """Map FlaLD MODS records to DPLA documents.

    Reads ``file_in`` through ``OAIReader``, skips deleted records and
    records without metadata, and builds one document per record with
    ``assets.build``.  A record is logged and skipped when it has no PURL,
    no rights, no title, or when the place lookup raises ``TypeError``.

    Records whose IID starts with one of the FSU partner prefixes get that
    partner as data provider and "Florida State University Libraries" as
    intermediate provider; every other record uses ``dprovide`` and
    ``iprovide`` as passed in.

    :param file_in: path of the harvested XML file; opened as UTF-8
    :param tn: forwarded unchanged to ``assets.thumbnail_service``
    :param dprovide: default data provider; also labels the CSV logger
    :param iprovide: default intermediate provider, forwarded to
        ``assets.build``
    :return: list of the objects returned by ``assets.build``
    """
    # (IID prefix, data provider) pairs for collections hosted by FSU on
    # behalf of a partner institution.
    fsu_partners = (
        ('FSU_FBCTLH', 'First Baptist Church of Tallahassee'),
        ('FSU_LeonHigh', 'Leon High School, Tallahassee, Florida'),
        ('FSU_Godby', 'Godby High School, Tallahassee, Florida'),
        ('FSU_HHHS', 'Havana History & Heritage Society, Havana, Florida'),
        # ('FSU_Ringling', 'John and Mable Ringling Museum of Art'),
    )
    fsu_intermediate = 'Florida State University Libraries'
    tgn_attribution = (
        "This record contains information from Thesaurus of Geographic "
        "Names (TGN) which is made available under the ODC Attribution "
        "License.")

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_MODS', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            if VERBOSE:
                print(record.oai_urn)
            logger.debug(record.oai_urn)
            sourceResource = {}

            if record.metadata is None:
                continue

            # sourceResource.alternative
            if len(record.metadata.titles) > 1:
                sourceResource['alternative'] = list(
                    record.metadata.titles[1:])

            # sourceResource.collection
            if record.metadata.collection:
                collection = record.metadata.collection
                sourceResource['collection'] = {}
                if collection.title:
                    sourceResource['collection']['name'] = collection.title
                if collection.location:
                    sourceResource['collection']['host'] = collection.location
                if collection.url:
                    sourceResource['collection']['_:id'] = collection.url

            # sourceResource.contributor
            # Accumulate every matching name; assigning a fresh one-item list
            # per iteration would keep only the last contributor.
            contributors = []
            try:
                for name in record.metadata.names:
                    # NOTE(review): with `or` this is true unless the role is
                    # both text 'Creator' and code 'cre'; confirm against how
                    # get_creators selects names (`and` may be intended).
                    if name.role.text != 'Creator' or name.role.code != 'cre':
                        if name.uri:
                            contributors.append(
                                {"@id": name.uri, "name": name.text})
                        else:
                            contributors.append({"name": name.text})
            except KeyError as err:
                logger.error('sourceResource.contributor: {0}, {1}'.format(
                    err, record.oai_urn))
            if contributors:
                sourceResource['contributor'] = contributors

            # sourceResource.creator
            if record.metadata.get_creators:
                sourceResource['creator'] = [
                    {"@id": name.uri, "name": name.text}
                    if name.uri else {"name": name.text}
                    for name in record.metadata.get_creators
                ]

            # sourceResource.date
            if record.metadata.dates:
                date = record.metadata.dates[0].text
                if ' - ' in date:
                    # range: first and last four characters are the years
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date[0:4],
                        "end": date[-4:]
                    }
                else:
                    sourceResource['date'] = {
                        "displayDate": date,
                        "begin": date,
                        "end": date
                    }

            # sourceResource.description
            if record.metadata.abstract:
                sourceResource['description'] = [
                    abstract.text for abstract in record.metadata.abstract
                ]

            # sourceResource.extent
            if record.metadata.extent:
                sourceResource['extent'] = record.metadata.extent

            # sourceResource.format
            if record.metadata.form:
                sourceResource['format'] = record.metadata.form

            # sourceResource.genre
            if record.metadata.genre:
                sourceResource['genre'] = [
                    {'name': genre.text, '@id': genre.uri}
                    if genre.uri else {'name': genre.text}
                    for genre in record.metadata.genre
                ]

            # sourceResource.identifier
            try:
                is_shown_at = record.metadata.purl[0]
            except IndexError as err:
                logger.error('sourceResource.identifier: {0}, {1}'.format(
                    err, record.oai_urn))
                continue
            sourceResource['identifier'] = is_shown_at

            # sourceResource.language
            try:
                if record.metadata.language:
                    sourceResource['language'] = [
                        {"name": lang.text, "iso_639_3": lang.code}
                        for lang in record.metadata.language
                    ]
            except AttributeError as err:
                logger.error('sourceResource.language: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.place : sourceResource['spatial']
            try:
                if record.metadata.geographic_code:
                    sourceResource['spatial'] = []
                    for geo_code in record.metadata.geographic_code:
                        code, lat, lng, label = assets.tgn_cache(
                            geo_code.strip())
                        sourceResource['spatial'].append({
                            "lat": lat,
                            "long": lng,
                            "name": label,
                            "_:attribution": tgn_attribution
                        })
            except TypeError as err:
                logger.error('sourceResource.spatial: {0}, {1}'.format(
                    err, record.oai_urn))
                continue

            # sourceResource.publisher
            if record.metadata.publisher:
                sourceResource['publisher'] = record.metadata.publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            if record.metadata.rights:
                sourceResource['rights'] = [
                    {"@id": rights.uri} if rights.uri
                    else {"text": rights.text}
                    for rights in record.metadata.rights
                ]
            else:
                logger.error('No sourceResource.rights - {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.subject
            try:
                if record.metadata.subjects:
                    sourceResource['subject'] = [
                        {"@id": subject.uri, "name": subject.text}
                        if subject.uri is not None
                        else {"name": subject.text}
                        for subject in record.metadata.subjects
                    ]
            except (TypeError, IndexError) as err:
                logger.error('sourceResource.subject: {0}, {1}'.format(
                    err, record.oai_urn))

            # sourceResource.title
            if record.metadata.titles:
                sourceResource['title'] = [
                    '{}'.format(record.metadata.titles[0])
                ]
            else:
                logger.error('No sourceResource.title: {0}'.format(
                    record.oai_urn))
                continue

            # sourceResource.type
            sourceResource['type'] = record.metadata.type_of_resource

            # aggregation.dataProvider / aggregation.intermediateProvider
            # Start from the caller's values for every record so that a
            # partner override never carries over to the following records.
            data_provider = dprovide
            intermediate_provider = iprovide
            # A missing IID falls back to the defaults instead of raising.
            iid = record.metadata.iid or ''
            for prefix, partner in fsu_partners:
                if iid.startswith(prefix):
                    data_provider = partner
                    intermediate_provider = fsu_intermediate
                    break

            # aggregation.isShownAt

            # aggregation.preview
            pid = record.metadata.pid
            if pid is None:
                # derive the PID from the last segment of the OAI URN
                pid = record.oai_urn.split(':')[-1].replace('_', ':')
            preview = assets.thumbnail_service(pid, tn)

            # aggregation.provider

            # build record
            if is_shown_at:
                doc = assets.build(record.oai_urn, sourceResource,
                                   data_provider, is_shown_at, preview,
                                   intermediate_provider)
                docs.append(doc)
            else:
                logger.error('No aggregation.isShownAt - {0}'.format(
                    record.oai_urn))
    return docs
def FlaLD_DC(file_in, tn, dprovide, iprovide=None):
    """Map FlaLD OAI Dublin Core (dPanther) records to DPLA documents.

    Reads ``file_in`` through ``OAIReader``, skips records flagged as deleted
    (repox ``deleted`` attribute or OAI-PMH header ``status``), and builds
    one document per record with ``assets.build``.  Records without a
    dPanther identifier, rights or a title are logged and skipped.

    :param file_in: path of the harvested XML file; opened as UTF-8
    :param tn: forwarded unchanged to ``assets.thumbnail_service``
    :param dprovide: data provider; also used to label the CSV logger
    :param iprovide: intermediate provider, forwarded to ``assets.build``
    :return: list of the objects returned by ``assets.build``
    """
    # Patterns are loop-invariant, so compile them once.
    purl_pattern = re.compile(
        'http://dpanther.fiu.edu/dpService/dpPurlService')
    dpanther_pattern = re.compile('http://dpanther')
    lcsh_suffix = re.compile(r"\( lcsh \)$")
    # str.rstrip() removes a *set of characters*, which also eats the tail of
    # the name itself ("Robert ( Contributor )" -> "Robe"); strip the literal
    # marker instead.
    contributor_suffix = re.compile(r'\s*\(\s*[Cc]ontributor\s*\)\s*$')

    def element(record, tag, **kwargs):
        """Look up a Dublin Core element on the record's metadata."""
        return record.metadata.get_element(
            './/{0}{1}'.format(dc, tag), **kwargs)

    with open(file_in, encoding='utf-8') as data_in:
        logger = assets.CSVLogger('FlaLD_DC', provider=dprovide)
        records = OAIReader(data_in)
        docs = []
        for record in records:

            # deleted record handling for repox
            try:
                if 'deleted' in record.attrib.keys():
                    if record.attrib['deleted'] == 'true':
                        continue
            except AttributeError:
                pass

            # deleted record handling for OAI-PMH
            try:
                header = record.find('./{*}header')
                if 'status' in header.attrib.keys():
                    if header.attrib['status'] == 'deleted':
                        continue
            except AttributeError:
                pass

            oai_id = record.oai_urn

            if VERBOSE:
                print(oai_id)
            logger.debug(oai_id)
            sourceResource = {}

            # sourceResource.alternative

            # sourceResource.collection

            # sourceResource.contributor
            if element(record, 'contributor'):
                sourceResource['contributor'] = [
                    {"name": name}
                    for name in element(record, 'contributor', delimiter=';')
                ]

            # sourceResource.creator
            if element(record, 'creator'):
                sourceResource['creator'] = []
                for name in element(record, 'creator', delimiter=';'):
                    # matches both "( Contributor )" and "( contributor )"
                    if "ontributor )" in name:
                        sourceResource.setdefault('contributor', []).append(
                            {"name": contributor_suffix.sub(
                                '', name.strip(" "))})
                    elif len(name) > 0:
                        sourceResource['creator'].append(
                            {"name": name.strip(" ")})

            # sourceResource.date
            date = element(record, 'date')
            if date:
                sourceResource['date'] = {
                    "begin": date[0],
                    "end": date[0],
                    "displayDate": date[0]
                }

            # sourceResource.description
            if element(record, 'description'):
                sourceResource['description'] = element(
                    record, 'description', delimiter=';')

            # sourceResource.extent

            # sourceResource.format
            dc_format = element(record, 'format')
            if dc_format:
                sourceResource['format'] = dc_format

            # sourceResource.genre

            # sourceResource.identifier
            identifier = element(record, 'identifier')
            try:
                for ID in identifier:
                    if purl_pattern.search(ID):
                        # a PURL wins; stop looking
                        sourceResource['identifier'] = ID
                        break
                    elif dpanther_pattern.search(ID):
                        # usable fallback, but keep looking for a PURL
                        sourceResource['identifier'] = ID
                        logger.warning(
                            'sourceResource.identifier: {0} - {1}'.format(
                                'Not a PURL', oai_id))
            except TypeError as err:
                logger.error('sourceResource.identifier: {0} - {1}'.format(
                    err, oai_id))
                continue

            # No dPanther identifier means no landing page; skip the record
            # instead of letting a KeyError abort the whole harvest.
            is_shown_at = sourceResource.get('identifier')
            if not is_shown_at:
                logger.error('No aggregation.isShownAt - {0}'.format(oai_id))
                continue

            # sourceResource.language
            if element(record, 'language'):
                sourceResource['language'] = []
                for lang in element(record, 'language', delimiter=';'):
                    if len(lang) > 3:
                        sourceResource['language'].append({"name": lang})
                    else:
                        sourceResource['language'].append(
                            {"iso_639_3": lang})

            # sourceResource.place : sourceResource['spatial']
            coverage = element(record, 'coverage')
            if coverage:
                sourceResource['spatial'] = [
                    {'name': place} for place in coverage
                ]

            # sourceResource.publisher
            publisher = element(record, 'publisher')
            if publisher:
                sourceResource['publisher'] = publisher

            # sourceResource.relation

            # sourceResource.isReplacedBy

            # sourceResource.replaces

            # sourceResource.rights
            rights = element(record, 'rights')
            if rights:
                sourceResource['rights'] = [{'text': rights[0]}]
            else:
                logger.error('No sourceResource.rights - {0}'.format(oai_id))
                continue

            # sourceResource.subject
            if element(record, 'subject'):
                sourceResource['subject'] = []
                for term in element(record, 'subject', delimiter=';'):
                    term = lcsh_suffix.sub('', term)
                    if len(term) > 0:
                        sourceResource['subject'].append(
                            {"name": term.strip(" ")})

            # sourceResource.title
            title = element(record, 'title')
            if title:
                sourceResource['title'] = title
            else:
                logger.error('No sourceResource.title - {0}'.format(oai_id))
                continue

            # sourceResource.type
            if element(record, 'type'):
                sourceResource['type'] = element(
                    record, 'type', delimiter=';')

            # webResource.fileFormat

            # aggregation.dataProvider
            data_provider = dprovide

            # aggregation.intermediateProvider

            # aggregation.isShownAt

            # aggregation.preview
            # A failed thumbnail lookup is not fatal: the record is still
            # published with preview=None.
            preview = None
            try:
                preview = assets.thumbnail_service(record, tn)
            except (TypeError, UnboundLocalError) as err:
                logger.warning('aggregation.preview: {0} - {1}'.format(
                    err, oai_id))

            # aggregation.provider

            # build record
            doc = assets.build(oai_id, sourceResource, data_provider,
                               is_shown_at, preview, iprovide)
            docs.append(doc)
    return docs