class Extra:
    # Raw identifier / contributor-type values taken straight from the
    # context; every lookup here is wrapped in tools.Try.
    name_identifier = tools.Try(ctx.nameIdentifier)
    name_identifier_scheme = tools.Try(
        ctx.nameIdentifier['@nameIdentifierScheme']
    )
    name_identifier_scheme_uri = tools.Try(
        ctx.nameIdentifier['@schemeURI']
    )
    contributor_type = tools.Try(ctx.contributorType)

    # v.4 new givenName and familyName properties.
    # Alternatives are listed creatorName first, then contributorName,
    # with Static(None) as the last entry.
    given_name = tools.OneOf(
        ctx.creatorName['@givenName'],
        ctx.contributorName['@givenName'],
        tools.Static(None),
    )
    family_name = tools.OneOf(
        ctx.creatorName['@familyName'],
        ctx.contributorName['@familyName'],
        tools.Static(None),
    )
# Parser for an HTML work page: fields are read from itemprop-tagged
# elements and from <dt>/<dd> pairs looked up by their <dt> label.
class CreativeWork(Parser):
    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(
        Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])

    # Alternatives in order: the rights link URL, the text of the "Rights"
    # <dd>, then Static(None).
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None),
    )
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text']))
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        Soup(ctx, itemprop='keywords'),
    )
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor),
                  Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Text of additional <dd> entries, each located by its <dt> label.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython(
            'get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        """Map the itemtype of the first <div> to a schema name.

        Unrecognised itemtype values map to 'CreativeWork'.
        """
        return {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        """Return the text of the <h1>, without its 'label' span if present.

        Note that the label span is removed from the underlying soup in
        place (decompose), not from a copy.
        """
        title = obj.h1.soup
        label = title.find('span', class_='label')
        # find() returns None when the heading has no label span; the
        # previous unconditional .decompose() raised AttributeError then.
        if label is not None:
            label.decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        """Return the <dd> following the <dt> whose string equals `dt`.

        The <dd> is wrapped in a SoupXMLDict; None is returned when no
        matching <dt> exists.
        """
        dt_tag = obj.soup.find('dt', string=dt)
        if dt_tag:
            return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))
        return None

    def get_rights_url(self, obj):
        """Return the href of the element wrapping the 'new window' icon
        inside the "Rights" <dd>.

        Raises AttributeError when there is no "Rights" entry or no icon.
        NOTE(review): this is the first alternative of the `rights` OneOf;
        presumably OneOf moves on to the next alternative on failure --
        confirm.
        """
        dd = self.get_dd(obj, 'Rights')
        return dd.soup.find('i', class_='glyphicon-new-window').parent['href']
# Parser for one MODS name element.
class MODSAgent(Parser):
    schema = tools.RunPython('get_agent_schema', ctx)

    # Alternatives in order: mods:displayForm, then the joined name parts.
    name = tools.OneOf(
        tools.RunPython(force_text, ctx['mods:displayForm']),
        tools.RunPython('squash_name_parts', ctx),
    )

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    lambda x: bool(x),
                    tools.RunPython(force_text, ctx['mods:affiliation']),
                )
            )
        ),
    )

    # NOTE(review): `'invalid' not in obj` is a substring test for str
    # values and a key test for dict values -- confirm it matches how an
    # invalid nameIdentifier is actually represented.
    identifiers = tools.Map(
        tools.Delegate(MODSAgentIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Try(ctx['mods:nameIdentifier']),
                    ),
                ),
            )
        ),
    )

    class Extra:
        # Raw values of the name element, each wrapped in tools.Try.
        name_type = tools.Try(ctx['@type'])
        name_part = tools.Try(ctx['mods:namePart'])
        affiliation = tools.Try(ctx['mods:affiliation'])
        description = tools.Try(ctx['mods:description'])
        display_form = tools.Try(ctx['mods:displayForm'])
        etal = tools.Try(ctx['mods:etal'])
        name_identifier = tools.Try(ctx['mods:nameIdentifier'])

    def squash_name_parts(self, name):
        """Join the text of every mods:namePart with single spaces."""
        return ' '.join(
            force_text(part) for part in get_list(name, 'mods:namePart')
        )

    def get_agent_schema(self, obj):
        """Choose the agent schema from the name's @type attribute.

        'personal' and 'conference' map to fixed schemas; every other
        value is decided by GuessAgentTypeLink run on the squashed name,
        with default='organization' passed only for 'corporate'.
        """
        name_type = obj.get('@type')
        if name_type == 'personal':
            return 'person'
        if name_type == 'conference':
            return 'organization'
        # TODO SHARE-718: return 'family' when name_type == 'family'
        if name_type == 'corporate':
            guesser = GuessAgentTypeLink(default='organization')
        else:
            guesser = GuessAgentTypeLink()
        return guesser.execute(self.squash_name_parts(obj))
# Parser for the agent behind a creator/contributor entry.
class ContributorAgent(Parser):
    # Alternatives in order: a guess seeded by get_agent_type(person=False)
    # with default='organization', then a guess from the creator or
    # contributor name.
    schema = tools.OneOf(
        tools.GuessAgentType(
            tools.RunPython(get_agent_type, ctx, person=False),
            default='organization',
        ),
        tools.GuessAgentType(
            tools.OneOf(ctx.creatorName, ctx.contributorName)
        ),
    )

    name = tools.OneOf(ctx.creatorName, ctx.contributorName)

    # The Try around the IRI mapping lists ValueError as its exception.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.Map(
                tools.IRI(ctx),
                tools.RunPython(force_text, ctx.nameIdentifier),
            ),
            exceptions=(ValueError, ),
        ),
    )

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Concat(
            tools.Try(
                tools.Filter(
                    lambda x: bool(x),
                    tools.RunPython(force_text, ctx.affiliation),
                )
            )
        ),
    )

    class Extra:
        # Raw identifier / contributor-type values, each wrapped in Try.
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme']
        )
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI']
        )
        contributor_type = tools.Try(ctx.contributorType)

        # v.4 new givenName and familyName properties
        given_name = tools.OneOf(
            ctx.creatorName['@givenName'],
            ctx.contributorName['@givenName'],
            tools.Static(None),
        )
        family_name = tools.OneOf(
            ctx.creatorName['@familyName'],
            ctx.contributorName['@familyName'],
            tools.Static(None),
        )
# Parser for an OAI record carrying Dublin Core metadata. `default_type`
# and `type_map` are class-level settings read by get_schema.
class OAICreativeWork(Parser):
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)),
    )

    title = tools.Join(
        tools.RunPython(
            force_text, tools.Try(ctx.record.metadata.dc['dc:title'])))

    description = tools.Join(
        tools.RunPython(
            force_text, tools.Try(ctx.record.metadata.dc['dc:description'])))

    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        force_text,
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier']),
                        ),
                    ),
                ),
            )
        ),
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(
                    tools.Try(tools.IRI(), exceptions=(InvalidIRI, )),
                    tools.RunPython('get_relation', ctx),
                )
            ),
        )
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator),
                  tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor),
                  tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython(
                force_text,
                tools.Try(ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple
    # languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]),
    )

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    force_text,
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    ),
                ),
            )
        ),
    )

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    force_text,
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    ),
                ),
            ),
            deep=True,
        ),
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython(
        'check_status', tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular
        elements that match their original entry to preserve raw data
        structure.
        """

        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial
        # applicability of the resource, or the jurisdiction under which the
        # resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle
        # of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language
        # reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        """Return True only when the header status equals 'deleted'."""
        return status == 'deleted'

    def get_schema(self, types):
        """Return the type_map entry for the first recognised dc:type.

        `types` may be a single string or a sequence whose items are strings
        or dicts carrying the text under '#text'. Values are lower-cased
        before the lookup. `default_type` is returned when `types` or
        `type_map` is empty, or when nothing matches.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        """Split each string on ' - ', '.' and ',' and return the pieces.

        Pieces are stripped first and empty ones dropped afterwards, so
        whitespace-only fragments (e.g. after a trailing ', ') no longer
        come back as '' tokens.
        """
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            pieces = (p.strip() for p in re.split(r'(?: - )|\.|,', item))
            tokens.extend(p for p in pieces if p)
        return tokens

    def get_relation(self, ctx):
        """Return the dc:relation values that do not repeat an identifier.

        A relation is dropped when, after removing 'http', ':' and '/', it
        occurs as a substring of the concatenation of all dc:identifier
        values plus the header identifier (normalised the same way).

        dc:relation / dc:identifier may each be a single string, a single
        dict with the text under '#text', or a sequence of those. Returns
        [] when the record has no metadata.
        """
        if not ctx['record'].get('metadata'):
            return []

        def as_texts(values):
            # A lone str or dict is wrapped first: iterating a bare string
            # would otherwise yield its characters one by one.
            if isinstance(values, (str, dict)):
                values = (values, )
            texts = (
                v.get('#text') if isinstance(v, dict) else v for v in values
            )
            return [text for text in texts if text]

        dc = ctx['record']['metadata']['dc']
        relations = as_texts(dc.get('dc:relation') or [])
        identifiers = as_texts(dc.get('dc:identifier') or [])

        known = re.sub(
            'http|:|/', '',
            ''.join(identifiers) + ctx['record']['header']['identifier'])

        return [
            r for r in relations
            if re.sub('http|:|/', '', r) not in known
        ]
# Parser for a MODS record. `default_type` / `type_map` are read by
# get_schema and `role_map` by filter_names.
class MODSCreativeWork(Parser):
    default_type = 'CreativeWork'
    type_map = None
    role_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(
            tools.RunPython(force_text, ctx['mods:genre']),
            tools.Static(None),
        ),
    )

    title = tools.RunPython('join_title_info', ctx)

    # Abstracts have the optional attribute "shareable". Don't bother
    # checking for it, because abstracts that are not shareable should not
    # have been shared with SHARE.
    description = tools.Join(
        tools.RunPython(force_text, tools.Try(ctx['mods:abstract']), '\n'))

    identifiers = tools.Map(
        tools.Delegate(MODSWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Filter(
                        lambda obj: 'invalid' not in obj,
                        tools.Concat(
                            tools.Try(ctx['mods:identifier']),
                            tools.Try(ctx.header['identifier']),
                            tools.Try(ctx['mods:location']['mods:url']),
                        ),
                    ),
                ),
            )
        ),
    )

    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(MODSWorkRelation),
            tools.Try(ctx['mods:relatedItem']),
        )
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(MODSCreator),
                  tools.RunPython('filter_names', ctx, 'creator')),
        tools.Map(tools.Delegate(MODSFunder),
                  tools.RunPython('filter_names', ctx, 'funder')),
        tools.Map(tools.Delegate(MODSHost),
                  tools.RunPython('filter_names', ctx, 'host')),
        tools.Map(tools.Delegate(MODSPublisher),
                  tools.RunPython('filter_names', ctx, 'publisher')),
        tools.Map(
            tools.Delegate(MODSContributor),
            tools.RunPython('filter_names', ctx, 'creator', 'funder', 'host',
                            'publisher', invert=True)),
        tools.Map(
            tools.Delegate(MODSSimplePublisher),
            tools.Try(ctx['mods:originInfo']['mods:publisher']),
        ),
    )

    rights = tools.RunPython(
        force_text, tools.Try(ctx['mods:accessCondition']), '\n')

    language = tools.ParseLanguage(
        tools.Try(ctx['mods:language']['mods:languageTerm']),
    )

    subjects = tools.Map(
        tools.Delegate(MODSThroughSubjects),
        tools.Subjects(
            tools.Concat(tools.Try(ctx['mods:subject']['mods:topic']), )))

    tags = tools.Map(
        tools.Delegate(MODSThroughTags),
        tools.Concat(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.Map(
                    tools.RunPython(force_text),
                    tools.Try(ctx.header.setSpec),
                    tools.Try(ctx['mods:genre']),
                    tools.Try(ctx['mods:classification']),
                    tools.Try(ctx['mods:subject']['mods:topic']),
                ),
            ),
            deep=True,
        ),
    )

    date_updated = tools.ParseDate(tools.Try(ctx.header.datestamp))

    # TODO (in regulator) handle date ranges, uncertain dates
    # ('1904-1941', '1890?', '1980-', '19uu', etc.)
    date_published = tools.OneOf(
        tools.ParseDate(
            tools.RunPython(
                force_text,
                tools.Try(ctx['mods:originInfo']['mods:dateIssued']))),
        tools.Static(None),
    )

    # NOTE(review): this reads ctx.record.header while the other header
    # lookups in this class read ctx.header -- confirm which path is right.
    is_deleted = tools.RunPython(
        lambda status: status == 'deleted',
        tools.Try(ctx.record.header['@status']))

    class Extra:
        """
        Fields that are combined in the base parser are relisted as singular
        elements that match their original entry to preserve raw data
        structure.
        """

        # (dc:description) http://www.loc.gov/standards/mods/userguide/abstract.html
        abstract = tools.Try(ctx['mods:abstract'])

        # (dc:rights) http://www.loc.gov/standards/mods/userguide/accesscondition.html
        accessConditions = tools.Try(ctx['mods:accessCondition'])

        # (dc:subject) http://www.loc.gov/standards/mods/userguide/classification.html
        classification = tools.Try(ctx['mods:classification'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/extension.html
        extension = tools.Try(ctx['mods:extension'])

        # SHARE type
        # (dc:type) http://www.loc.gov/standards/mods/userguide/genre.html
        genre = tools.Try(ctx['mods:genre'])

        # (dc:identifier) http://www.loc.gov/standards/mods/userguide/identifier.html
        identifier = tools.Try(ctx['mods:identifier'])

        # (dc:language) http://www.loc.gov/standards/mods/userguide/language.html
        language = tools.Try(ctx['mods:language'])

        # (dc:identifier for url) http://www.loc.gov/standards/mods/userguide/location.html
        location = tools.Try(ctx['mods:location'])

        # (dc:creator|dc:contributor) http://www.loc.gov/standards/mods/userguide/name.html
        name = tools.Try(ctx['mods:name'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/note.html
        note = tools.Try(ctx['mods:note'])

        # (dc:publisher|dc:date) http://www.loc.gov/standards/mods/userguide/origininfo.html
        originInfo = tools.Try(ctx['mods:originInfo'])

        # Extra
        # (dc:title) http://www.loc.gov/standards/mods/userguide/part.html
        part = tools.Try(ctx['mods:part'])

        # (dc:format or N/A) http://www.loc.gov/standards/mods/userguide/physicaldescription.html
        physicalDescription = tools.Try(ctx['mods:physicalDescription'])

        # Metadata information
        # (N/A) http://www.loc.gov/standards/mods/userguide/recordinfo.html
        recordInfo = tools.Try(ctx['mods:recordInfo'])

        # (dc:relation) http://www.loc.gov/standards/mods/userguide/relateditem.html
        relatedItem = tools.Try(ctx['mods:relatedItem'])

        # (dc:subject|dc:type|dc:coverage|N/A) http://www.loc.gov/standards/mods/userguide/subject.html
        subject = tools.Try(ctx['mods:subject'])

        # (dc:description) http://www.loc.gov/standards/mods/userguide/tableofcontents.html
        tableOfContents = tools.Try(ctx['mods:tableOfContents'])

        # (N/A) http://www.loc.gov/standards/mods/userguide/targetaudience.html
        targetAudience = tools.Try(ctx['mods:targetAudience'])

        # (dc:title) http://www.loc.gov/standards/mods/userguide/titleinfo.html
        titleInfo = tools.Try(ctx['mods:titleInfo'])

        # Extra
        # (dc:type) http://www.loc.gov/standards/mods/userguide/typeofresource.html
        typeOfResource = tools.Try(ctx['mods:typeOfResource'])

    def get_schema(self, types):
        """Return the type_map entry for the first recognised genre.

        `types` may be a single string or a sequence whose items are strings
        or dicts carrying the text under '#text'. Values are lower-cased
        before the lookup. `default_type` is returned when `types` or
        `type_map` is empty, or when nothing matches.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def tokenize(self, data):
        """Split each string on ' - ', '.' and ',' and return the pieces.

        Pieces are stripped first and empty ones dropped afterwards, so
        whitespace-only fragments (e.g. after a trailing ', ') no longer
        come back as '' tokens.
        """
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            pieces = (p.strip() for p in re.split(r'(?: - )|\.|,', item))
            tokens.extend(p for p in pieces if p)
        return tokens

    # Map titleInfos to a string:
    # https://www.loc.gov/standards/mods/userguide/titleinfo.html#mappings
    def join_title_info(self, obj):
        """Build one title string from every mods:titleInfo entry.

        Within an entry the parts are concatenated in the order nonSort,
        title, subTitle (': '), partNumber ('. '), partName (': '); the
        delimiter is emitted only when the part is non-empty. Entries are
        joined with '. '.
        """
        def get_part(title_info, part_name, delimiter=''):
            part = force_text(title_info.get(part_name, ''), ' ').strip()
            return delimiter + part if part else ''

        titles = []
        for title_info in get_list(obj, 'mods:titleInfo'):
            # NOTE(review): nonSort is stripped and joined to the title with
            # no delimiter ('The ' + 'Title' -> 'TheTitle') -- confirm this
            # is intended.
            title = ''.join((
                get_part(title_info, 'mods:nonSort'),
                get_part(title_info, 'mods:title'),
                get_part(title_info, 'mods:subTitle', ': '),
                get_part(title_info, 'mods:partNumber', '. '),
                get_part(title_info, 'mods:partName', ': '),
            ))
            if title:
                titles.append(title)
        return '. '.join(titles)

    def filter_names(self, obj, *roles, invert=False):
        """Select mods:name entries by role.

        A name matches when any of its lower-cased mods:roleTerm values, or
        the role_map translation of one, is in `roles`. Matching names are
        returned, or the non-matching ones when `invert` is true. Input
        order is kept and each name is emitted at most once.
        """
        # role_map defaults to None on the class; treat that as "no mapping"
        # instead of failing on the membership test.
        role_map = self.role_map or {}

        matching, remaining = [], []
        for name in get_list(obj, 'mods:name'):
            terms = {
                force_text(term).lower()
                for role in get_list(name, 'mods:role')
                for term in get_list(role, 'mods:roleTerm')
            }
            # The set comprehension is built in full before update(), so the
            # set is never mutated while it is being iterated.
            terms.update({role_map[t] for t in terms if t in role_map})

            # Decide once per name: deciding once per role duplicated names
            # with several matching roles (or raised ValueError on the
            # second remove when inverting).
            if terms.intersection(roles):
                matching.append(name)
            else:
                remaining.append(name)
        return remaining if invert else matching
# Relation emitted with the fixed schema 'Contributor'; the agent itself is
# delegated to ContributorAgent on the same context.
class ContributorRelation(Parser):
    schema = 'Contributor'

    agent = tools.Delegate(ContributorAgent, ctx)

    # Alternatives in order: creatorName, then contributorName.
    cited_as = tools.OneOf(
        ctx.creatorName,
        ctx.contributorName,
    )
# Contributor relation whose agent is delegated to Person on the same
# context.
class Contributor(Parser):
    agent = tools.Delegate(Person, ctx)

    # Alternatives in order: full_name from the embedded user's attributes,
    # then full_name from the meta of the first embedded error entry.
    cited_as = tools.OneOf(
        ctx.embeds.users.data.attributes.full_name,
        ctx.embeds.users.errors[0].meta.full_name,
    )