class Preprint(Parser):
    """Parse one harvested Atom feed entry (arXiv namespace) into a Preprint.

    The chains below are evaluated lazily by the ``Parser`` machinery against
    ``ctx.entry`` — a single ``<entry>`` element of the harvested document.
    """

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    # Fields intentionally left unpopulated for this source:
    # free_to_read_type
    # free_to_read_date
    # rights
    # language

    # Each category's @term attribute is used both as a subject and as a tag.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.Map(ctx['@term'], ctx.entry.category)),
    )
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Map(ctx['@term'], ctx.entry.category),
    )
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), ctx.entry.author),
    )
    # related_works
    # Identify the work by its arxiv:doi (when present) and its entry id.
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx.entry['arxiv:doi']),
        ctx.entry.id)

    class Extra:
        # Raw source values preserved alongside the normalized work.
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])
class Preprint(Parser):
    """Parse one harvested Atom feed entry (arXiv namespace) into a Preprint.

    The chains below are evaluated lazily by the ``Parser`` machinery against
    ``ctx.entry`` — a single ``<entry>`` element of the harvested document.
    """

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    # Link the work via its arxiv:doi (when present) and its entry id.
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Try(ctx.entry['arxiv:doi']),
        ctx.entry.id)
    # Category @term attributes are extracted by get_subjects() below, then
    # normalized through tools.Subjects.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython('get_subjects',
                            tools.Concat(tools.Try(ctx.entry.category)))))
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        # Raw source values preserved alongside the normalized work.
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])

    def get_subjects(self, link):
        """Return the @term attribute of each category element in *link*."""
        # Comprehension instead of list(map(lambda ...)) — same result.
        return [category['@term'] for category in link]
class CreativeWork(Parser):
    """Parse a normalized provider document (contributors/uris/licenses shape)
    into a CreativeWork.
    """

    title = ctx.title
    description = tools.Try(ctx.description)
    is_deleted = tools.RunPython('_is_deleted', tools.Try(ctx.otherProperties))
    date_updated = tools.ParseDate(tools.Try(ctx.providerUpdatedDateTime))
    rights = tools.Join(tools.Try(ctx.licenses.uri))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(tools.Try(ctx.languages[0]), )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), tools.Try(ctx.contributors)),
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx.publisher)),
        tools.Map(tools.Delegate(Funder), tools.Try(ctx.sponsorships)))

    # Gather every URI the provider reports, de-duplicate deterministically
    # via unique(), and emit each as a WorkIdentifier.
    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Map(
            tools.IRI(),
            tools.RunPython(
                'unique',
                tools.Concat(tools.Try(ctx.uris.canonicalUri),
                             tools.Try(ctx.uris.providerUris),
                             tools.Try(ctx.uris.descriptorUris),
                             tools.Try(ctx.uris.objectUris)))))

    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Subjects(tools.Try(ctx.subjects)))

    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx.tags),
                     tools.Try(ctx.subjects))

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure. """
        freeToRead = tools.Try(ctx.freeToRead)
        languages = tools.Try(ctx.languages)
        licenses = tools.Try(ctx.licenses)
        otherProperties = tools.Try(ctx.otherProperties)
        publisher = tools.Try(ctx.publisher)
        subjects = tools.Try(ctx.subjects)
        sponsorships = tools.Try(ctx.sponsorships)
        tags = tools.Try(ctx.tags)
        uris = tools.Try(ctx.uris)
        version = tools.Try(ctx.version)

    def unique(self, items):
        """Return *items* de-duplicated and sorted (stable across runs)."""
        # sorted() already returns a list; the former list() wrapper was redundant.
        return sorted(set(items))

    def _is_deleted(self, properties):
        """Return True when the first 'status' property lists 'deleted'."""
        for prop in properties or []:
            if prop['name'] == 'status':
                return 'deleted' in prop['properties'].get('status', [])
        return False
class Preprint(Parser):
    """Parse scraped HTML ``<meta>`` tags (Dublin Core / Highwire citation_*
    keys) into a Preprint.
    """

    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # is_deleted
    date_published = tools.ParseDate(tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # free_to_read_type
    # free_to_read_date
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])

    # Every record gets the static 'Biology' subject plus any reported
    # subject areas.
    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Static('Biology'),
                         tools.Subjects(tools.Try(ctx['subject-areas'])))
    tags = tools.Map(tools.Delegate(ThroughTags), tools.Try(ctx['category']),
                     tools.Try(ctx['subject-areas']))
    identifiers = tools.Map(tools.Delegate(WorkIdentifier),
                            tools.Try(ctx['og:url']),
                            ctx['citation_public_url'], ctx['citation_doi'])
    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx['DC.Publisher'])),
        tools.Map(tools.Delegate(Creator),
                  tools.RunPython('get_contributors', ctx)))
    # related_works

    class Extra:
        # Raw source values preserved alongside the normalized work.
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    @staticmethod
    def _values(link, key):
        """Return link[key] coerced to a list; [] when the key is absent.

        Meta-tag scrapes report a bare value for a single occurrence and a
        list for repeated occurrences — normalize both to a list.
        """
        value = link.get(key, [])
        return value if isinstance(value, list) else [value]

    def get_contributors(self, link):
        """Zip citation_author / _institution / _email meta values into
        contributor dicts, padding missing values with None.
        """
        authors = self._values(link, 'citation_author')
        institutions = self._values(link, 'citation_author_institution')
        emails = self._values(link, 'citation_author_email')
        return [
            {
                'author': author,
                'institution': institution,
                'email': email,
            }
            for author, email, institution in itertools.zip_longest(
                authors, emails, institutions)
        ]
class OAICreativeWork(Parser):
    """Parse an OAI-PMH Dublin Core record (``ctx.record``) into a work.

    Subclasses may set ``type_map`` (lowercased dc:type string -> schema name)
    and ``default_type`` to control which work type get_schema() resolves.
    """

    # Overridden by subclasses; with both None get_schema() always returns
    # default_type (i.e. None) unless a subclass provides a mapping.
    default_type = None
    type_map = None

    schema = tools.RunPython(
        'get_schema',
        tools.OneOf(ctx.record.metadata.dc['dc:type'], tools.Static(None)))

    title = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:title'])))

    description = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    # dc:identifier values plus the OAI header identifier, filtered by
    # not_citation, coerced to IRIs (ValueError drops unparseable ones),
    # then de-duplicated.
    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        tools.Unique(
            tools.Map(
                tools.Try(tools.IRI(), exceptions=(ValueError, )),
                tools.Filter(
                    not_citation,
                    tools.RunPython(
                        'force_text',
                        tools.Concat(
                            tools.Try(ctx.record.metadata.dc['dc:identifier']),
                            tools.Try(ctx.record.header['identifier'])))))))

    # dc:relation values that do not merely restate this record's own
    # identifiers (see get_relation()).
    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(OAIWorkRelation),
            tools.Unique(
                tools.Map(tools.Try(tools.IRI(), exceptions=(ValueError, )),
                          tools.RunPython('get_relation', ctx)))))

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(OAICreator),
                  tools.Try(ctx.record.metadata.dc['dc:creator'])),
        tools.Map(tools.Delegate(OAIContributor),
                  tools.Try(ctx.record.metadata.dc['dc:contributor'])),
        tools.Map(
            tools.Delegate(OAIPublisher),
            tools.RunPython('force_text',
                            tools.Try(
                                ctx.record.metadata.dc['dc:publisher']))),
    )

    rights = tools.Join(tools.Try(ctx.record.metadata.dc['dc:rights']))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx.record.metadata.dc['dc:language'][0]), )

    # Subjects and tags share the same pipeline: setSpec + dc:type +
    # dc:format + dc:subject, text-coerced then tokenized.
    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.RunPython(
                    'force_text',
                    tools.Concat(
                        tools.Try(ctx.record.header.setSpec),
                        tools.Try(ctx.record.metadata.dc['dc:type']),
                        tools.Try(ctx.record.metadata.dc['dc:format']),
                        tools.Try(ctx.record.metadata.dc['dc:subject']),
                    )))))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.Concat(tools.Map(
            tools.RunPython('tokenize'),
            tools.RunPython(
                'force_text',
                tools.Concat(
                    tools.Try(ctx.record.header.setSpec),
                    tools.Try(ctx.record.metadata.dc['dc:type']),
                    tools.Try(ctx.record.metadata.dc['dc:format']),
                    tools.Try(ctx.record.metadata.dc['dc:subject']),
                ))),
            deep=True))

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    # OAI-PMH marks deleted records with status="deleted" on the header.
    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure. """
        # An agent responsible for making contributions to the resource.
        contributor = tools.Try(ctx.record.metadata.dc['dc:contributor'])

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Try(ctx.record.metadata.dc['dc:coverage'])

        # An agent primarily responsible for making the resource.
        creator = tools.Try(ctx.record.metadata.dc['dc:creator'])

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Try(ctx.record.metadata.dc['dc:date'])

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Try(ctx.record.metadata.dc['dc:format'])

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx.record.metadata.dc['dc:identifier']),
            tools.Try(ctx.record.header['identifier']))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Try(ctx.record.metadata.dc['dc:source'])

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Try(ctx.record.header.setSpec)

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Try(ctx.record.header['@status'])

    def check_status(self, status):
        # True only for the OAI-PMH deleted marker.
        if status == 'deleted':
            return True
        return False

    def get_schema(self, types):
        """Map dc:type value(s) through ``type_map``; fall back to
        ``default_type`` when nothing matches or no map is configured.
        """
        if not types or not self.type_map:
            return self.default_type
        if isinstance(types, str):
            types = [types]
        for t in types:
            if isinstance(t, dict):
                # XML-to-dict elements carry their text under '#text'.
                t = t['#text']
            t = t.lower()
            if t in self.type_map:
                return self.type_map[t]
        return self.default_type

    def force_text(self, data):
        """Coerce XML-to-dict output to plain text.

        dict -> its '#text' value; str -> unchanged; list/None -> list of
        texts, skipping None entries and dicts lacking '#text'.
        Raises on any other element type so bad input is surfaced.
        """
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data
        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    # NOTE(review): logger.warn is deprecated; prefer
                    # logger.warning.
                    logger.warn('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        """Split each string on ' - ', '.' and ',' and strip the pieces."""
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.|,', item) if x])
        return tokens

    def get_relation(self, ctx):
        # NOTE: the ctx parameter here is the raw record dict passed by
        # RunPython, shadowing the module-level chain context.
        """Return dc:relation values that are not restatements of this
        record's own identifiers (compared with http/:/ characters removed).
        """
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation') or []
        identifiers = ctx['record']['metadata']['dc'].get(
            'dc:identifier') or []
        if isinstance(identifiers, dict):
            identifiers = (identifiers, )
        # Fold all identifiers (plus the header identifier) into one
        # normalized string used for substring exclusion below.
        identifiers = ''.join(i['#text'] if isinstance(i, dict) else i
                              for i in identifiers if i)
        identifiers = re.sub(
            'http|:|/', '',
            identifiers + ctx['record']['header']['identifier'])
        if isinstance(relation, dict):
            relation = (relation['#text'], )
        return [
            r for r in relation
            if r and re.sub('http|:|/', '', r) not in identifiers
        ]
class OAICreativeWork(Parser):
    """Parse an OAI-PMH Dublin Core record (``ctx.record``) into a
    CreativeWork, classifying dc:creator/dc:contributor names into people,
    institutions, and organizations by keyword.
    """

    schema = 'CreativeWork'

    # Keyword heuristics used by get_contributors() to classify names.
    ORGANIZATION_KEYWORDS = ('the', 'center')
    INSTITUTION_KEYWORDS = ('school', 'university', 'institution', 'institute')

    title = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(
                            ctx['record']['metadata']['dc']['dc:title'])))

    description = tools.Join(
        tools.RunPython('force_text',
                        tools.Try(ctx.record.metadata.dc['dc:description'])))

    publishers = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIPublisher))),
        tools.Map(tools.RunPython('force_text'),
                  tools.Try(ctx.record.metadata.dc['dc:publisher'])))

    rights = tools.Join(
        tools.Maybe(tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:rights'))

    # Note: this is only taking the first language in the case of multiple languages
    language = tools.ParseLanguage(
        tools.Try(ctx['record']['metadata']['dc']['dc:language'][0]), )

    # dc:creator + dc:contributor names are routed three ways by
    # get_contributors(); the second RunPython argument selects the bucket.
    contributors = tools.Map(
        tools.Delegate(OAIContributor),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'contributor'))

    institutions = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIInstitution))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'institution'))

    organizations = tools.Map(
        tools.Delegate(
            OAIAssociation.using(entity=tools.Delegate(OAIOrganization))),
        tools.RunPython(
            'get_contributors',
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:creator'),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:contributor')), 'organization'))

    subjects = tools.Map(
        tools.Delegate(OAIThroughSubjects),
        tools.Subjects(
            tools.Map(
                tools.RunPython('tokenize'),
                tools.Try(ctx['record']['header']['setSpec']),
                tools.Try(ctx['record']['metadata']['dc']['dc:type']),
                tools.Try(ctx['record']['metadata']['dc']['dc:format']),
                tools.Try(ctx['record']['metadata']['dc']['dc:subject']),
            )))

    tags = tools.Map(
        tools.Delegate(OAIThroughTags),
        tools.RunPython(
            'force_text',
            tools.Concat(
                tools.Try(ctx['record']['header']['setSpec']),
                tools.Try(ctx['record']['metadata']['dc']['dc:type']),
                tools.Try(ctx['record']['metadata']['dc']['dc:format']),
                tools.Try(ctx['record']['metadata']['dc']['dc:subject']),
            )))

    links = tools.Map(
        tools.Delegate(OAIThroughLinks),
        tools.RunPython(
            'get_links',
            tools.Concat(
                tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
                tools.Maybe(
                    tools.Maybe(ctx['record'], 'metadata')['dc'],
                    'dc:relation'))))

    date_updated = tools.ParseDate(ctx['record']['header']['datestamp'])

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure. """
        # An entity responsible for making contributions to the resource.
        contributor = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:contributor')

        # The spatial or temporal topic of the resource, the spatial applicability of the resource,
        # or the jurisdiction under which the resource is relevant.
        coverage = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:coverage')

        # An entity primarily responsible for making the resource.
        creator = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:creator')

        # A point or period of time associated with an event in the lifecycle of the resource.
        dates = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:date')

        # The file format, physical medium, or dimensions of the resource.
        resource_format = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:format')

        # An unambiguous reference to the resource within a given context.
        identifiers = tools.Concat(
            tools.Try(ctx['record']['metadata']['dc']['dc:identifier']),
            tools.Maybe(ctx['record']['header'], 'identifier'))

        # A related resource.
        relation = tools.RunPython('get_relation', ctx)

        # A related resource from which the described resource is derived.
        source = tools.Maybe(
            tools.Maybe(ctx['record'], 'metadata')['dc'], 'dc:source')

        # The nature or genre of the resource.
        resource_type = tools.Try(ctx.record.metadata.dc['dc:type'])

        set_spec = tools.Maybe(ctx.record.header, 'setSpec')

        # Language also stored in the Extra class in case the language reported cannot be parsed by ParseLanguage
        language = tools.Try(ctx.record.metadata.dc['dc:language'])

        # Status in the header, will exist if the resource is deleted
        status = tools.Maybe(ctx.record.header, '@status')

    def get_links(self, ctx):
        """Extract URLs/DOIs from identifier strings.

        Plain URLs pass through; bare DOIs are expanded to
        http://dx.doi.org/ links. Non-string entries are skipped.
        """
        links = []
        for link in ctx:
            if not link or not isinstance(link, str):
                continue
            found_url = URL_REGEX.search(link)
            if found_url is not None:
                links.append(found_url.group())
                continue
            found_doi = DOI_REGEX.search(link)
            if found_doi is not None:
                found_doi = found_doi.group()
                if 'dx.doi.org' in found_doi:
                    links.append(found_doi)
                else:
                    links.append('http://dx.doi.org/{}'.format(
                        found_doi.replace('doi:', '')))
        return links

    def force_text(self, data):
        """Coerce XML-to-dict output to plain text.

        dict -> its '#text' value; str -> unchanged; list/None -> list of
        texts, skipping None entries and dicts lacking '#text'.
        Raises on any other element type so bad input is surfaced.
        """
        if isinstance(data, dict):
            return data['#text']
        if isinstance(data, str):
            return data
        fixed = []
        for datum in (data or []):
            if datum is None:
                continue
            if isinstance(datum, dict):
                if '#text' not in datum:
                    # warning() instead of the deprecated warn() alias.
                    logger.warning('Skipping %s, no #text key exists', datum)
                    continue
                fixed.append(datum['#text'])
            elif isinstance(datum, str):
                fixed.append(datum)
            else:
                raise Exception(datum)
        return fixed

    def tokenize(self, data):
        """Split each string on ' - ' and '.' and strip the pieces.

        Bug fix: the previous version split ``data`` (the whole input)
        instead of each ``item``, which duplicated tokens for string input
        and raised TypeError for list input. A bare string is now treated
        as a one-element list, matching the sibling OAI parser.
        """
        if isinstance(data, str):
            data = [data]
        tokens = []
        for item in data:
            tokens.extend(
                [x.strip() for x in re.split(r'(?: - )|\.', item) if x])
        return tokens

    def get_relation(self, ctx):
        """Return the record's dc:relation value(s) as text (or [])."""
        if not ctx['record'].get('metadata'):
            return []
        relation = ctx['record']['metadata']['dc'].get('dc:relation', [])
        if isinstance(relation, dict):
            return relation['#text']
        return relation

    def get_contributors(self, options, entity):
        """ Returns list of organization, institutions, or contributors names based on entity type. """
        options = [o if isinstance(o, str) else o['#text'] for o in options]
        if entity == 'organization':
            # Organization keyword present, institution keyword absent.
            return [
                value for value in options
                if (value
                    and not self.list_in_string(value,
                                                self.INSTITUTION_KEYWORDS)
                    and self.list_in_string(value,
                                            self.ORGANIZATION_KEYWORDS))
            ]
        elif entity == 'institution':
            return [
                value for value in options
                if (value
                    and self.list_in_string(value, self.INSTITUTION_KEYWORDS))
            ]
        elif entity == 'contributor':
            # Neither keyword set matches -> assume a person.
            return [
                value for value in options
                if (value
                    and not self.list_in_string(value,
                                                self.INSTITUTION_KEYWORDS)
                    and not self.list_in_string(value,
                                                self.ORGANIZATION_KEYWORDS))
            ]
        else:
            return options

    def list_in_string(self, string, list_):
        """True when any keyword in *list_* occurs in *string* (case-insensitive)."""
        return any(word in string.lower() for word in list_)
class CreativeWork(Parser):
    '''
    Documentation for Datacite's metadata:
    https://schema.labs.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf
    '''

    def get_schema(self, type):
        # NOTE(review): parameter name shadows the builtin ``type``; kept for
        # interface stability.
        """Map a Datacite resourceTypeGeneral value to a work schema name;
        unknown values fall back to 'CreativeWork'.
        """
        return {
            'dataset': 'DataSet',
            'software': 'Software',
            'text/book': 'Book',
            'text/book chapter': 'Book',
            'text/book prospectus': 'Book',
            'text/book series': 'Book',
            'text/conference abstract': 'ConferencePaper',
            'text/conference paper': 'ConferencePaper',
            'text/conference poster': 'Poster',
            'text/dissertation': 'Dissertation',
            'text/edited book': 'Book',
            'text/journal article': 'Article',
            'text/journal issue': 'Article',
            'text/patent': 'Patent',
            'text/report': 'Report',
            'text/supervised student publication': 'Thesis',
            'text/working paper': 'WorkingPaper'
            # Unmapped Datacite types (resolve to 'CreativeWork'):
            # 'audiovisual': '',
            # 'collection': '',
            # 'event': '',
            # 'image': '',
            # 'interactiveresource': '',
            # 'model': '',
            # 'physicalobject': '',
            # 'service': '',
            # 'sound': '',
            # 'text15': '',
            # 'workflow': '',
            # 'text/book review': '',
            # 'text/conference program': '',
            # 'text/dictionary entry': '',
            # 'text/disclosure': '',
            # 'text/encyclopedia entry': '',
            # 'text/Funding submission': '',
            # 'text/license': '',
            # 'text/magazine article': '',
            # 'text/manual': '',
            # 'text/newsletter article': '',
            # 'text/newspaper article': '',
            # 'text/online resource': '',
            # 'text/registered copyright': '',
            # 'text/research tool': '',
            # 'text/tenure-promotion': '',
            # 'text/test': '',
            # 'text/trademark': '',
            # 'text/translation': '',
            # 'text/university academic unit': '',
            # 'text/website': '',
        }.get(type.lower()) or 'CreativeWork'

    schema = tools.RunPython(
        'get_schema',
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  resourceType['@resourceTypeGeneral'],
                  default='CreativeWork'))

    title = tools.RunPython(
        force_text,
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title))

    # Only the first description element is used.
    description = tools.RunPython(
        force_text,
        tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                  descriptions.description[0]))

    rights = tools.Try(
        tools.Join(
            tools.RunPython(
                'text_list',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.rightsList.rights))))

    language = tools.ParseLanguage(
        tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.language))

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(CreatorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          creators.creator))),
        tools.Map(
            tools.Delegate(ContributorRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          contributors.contributor))),
        tools.Map(
            tools.Delegate(PublisherRelation),
            tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                      publisher)),
        # Contributors with contributorType 'HostingInstitution' become hosts.
        tools.Map(
            tools.Delegate(HostRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                ['HostingInstitution'])),
        # v.3 Funder is a contributor type
        # v.4 FundingReference replaces funder contributor type
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.RunPython(
                get_contributors,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.contributors.contributor)),
                ['Funder'])),
        tools.Map(
            tools.Delegate(FunderRelation),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          fundingReference))))

    # v.4 New, free text, 'subjectScheme' attribute on subject
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                'text_list',
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.subjects.subject), ))))

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.RunPython(
            force_text,
            tools.Concat(
                tools.Maybe(
                    tools.Maybe(ctx.record, 'metadata')['oai_datacite'],
                    'type'),
                tools.RunPython('text_list', (tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.subjects.subject)))),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          formats.format),
                tools.Try(
                    ctx.record.metadata['oai_datacite'].datacentreSymbol),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['#text']),
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          resourceType['@resourceTypeGeneral']),
                tools.Maybe(ctx.record.header, 'setSpec'),
                tools.Maybe(ctx.record.header, '@status'))))

    # Primary identifier plus any alternate identifiers.
    identifiers = tools.Concat(
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          identifier))),
        tools.Map(
            tools.Delegate(WorkIdentifier),
            tools.Concat(
                tools.Try(ctx.record.metadata['oai_datacite'].payload.resource.
                          alternateIdentifiers.alternateidentifier))))

    # relatedIdentifiers are split by get_related_works into forward (False)
    # and inverse (True) relations.
    related_works = tools.Concat(
        tools.Map(
            tools.Delegate(WorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.relatedIdentifiers.relatedIdentifier)),
                False)),
        tools.Map(
            tools.Delegate(InverseWorkRelation),
            tools.RunPython(
                get_related_works,
                tools.Concat(
                    tools.Try(ctx.record.metadata['oai_datacite'].payload.
                              resource.relatedIdentifiers.relatedIdentifier)),
                True)))

    date_updated = tools.ParseDate(tools.Try(ctx.record.header.datestamp))

    # The dateType='Issued' date is the publication date.
    date_published = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.dates.date), 'Issued')))

    free_to_read_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                  resource.rightsList.rights['@rightsURI'])

    # The dateType='Available' date marks when the work became free to read.
    free_to_read_date = tools.ParseDate(
        tools.Try(
            tools.RunPython(
                'get_date_type',
                tools.Concat(ctx.record.metadata['oai_datacite'].payload.
                             resource.dates.date), 'Available')))

    # OAI-PMH marks deleted records with status="deleted" on the header.
    is_deleted = tools.RunPython('check_status',
                                 tools.Try(ctx.record.header['@status']))

    class Extra:
        """ Fields that are combined in the base parser are relisted as singular elements that match
        their original entry to preserve raw data structure. """
        status = tools.Try(ctx.record.header['@status'])

        datestamp = tools.ParseDate(ctx.record.header.datestamp)

        set_spec = tools.Try(ctx.record.header.setSpec)

        is_reference_quality = tools.Try(
            ctx.record.metadata['oai_datacite'].isReferenceQuality)

        schema_version = tools.Try(
            ctx.record.metadata['oai_datacite'].schemaVersion)

        datacentre_symbol = tools.Try(
            ctx.record.metadata['oai_datacite'].datacentreSymbol)

        identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.identifier)

        alternate_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            alternateIdentifiers.alternateidentifier)

        titles = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.titles.title)

        publisher = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.publisher)

        publication_year = tools.Try(ctx.record.metadata['oai_datacite'].
                                     payload.resource.publicationYear)

        subject = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                            resource.subjects.subject)

        resourceType = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.resourceType)

        sizes = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.size)

        format_type = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                resource.formats.format)

        version = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.version)

        rights = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rights)

        rightsList = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.rightsList)

        related_identifiers = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.
            relatedIdentifiers.relatedIdentifier)

        description = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.descriptions)

        dates = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.dates.date)

        contributors = tools.Try(ctx.record.metadata['oai_datacite'].payload.
                                 resource.contributors.contributor)

        creators = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.creators)

        # v.4 new property geoLocationPolygon, in addition to geoLocationPoint and geoLocationBox
        geolocations = tools.Try(
            ctx.record.metadata['oai_datacite'].payload.resource.geoLocations)

        funding_reference = tools.Try(ctx.record.metadata['oai_datacite'].
                                      payload.resource.fundingReference)

    def check_status(self, status):
        # True only for the OAI-PMH deleted marker.
        if status == 'deleted':
            return True
        return False

    def get_date_type(self, date_obj, date_type):
        """Return the '#text' of the last date element whose @dateType
        matches *date_type*; raise KeyError when absent or a '0000'
        placeholder, which makes the surrounding tools.Try yield nothing.
        """
        date = None
        for obj in date_obj:
            if obj['@dateType'] == date_type:
                date = obj['#text']
        if date and date != '0000':
            return date
        # raise KeyError to break TryLink
        raise KeyError()

    def text_list(self, data):
        """Flatten a list of XML-to-dict items ('#text' dicts or plain
        strings) into a list of strings; anything else is logged and
        skipped. Raises when *data* is not a list.
        """
        text_list = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    if '#text' in item:
                        text_list.append(item['#text'])
                        continue
                elif isinstance(item, str):
                    text_list.append(item)
                    continue
                logger.warning(
                    '#text is not in {} and it is not a string'.format(item))
            return text_list
        else:
            raise Exception('{} is not a list.'.format(data))
class Preprint(OAICreativeWork):
    """OAI work parsed as a preprint; subjects come straight from dc:subject
    (overriding the base class's tokenized setSpec/type/format/subject mix).
    """

    schema = 'preprint'

    subjects = tools.Map(tools.Delegate(ThroughSubjects),
                         tools.Subjects(ctx.record.metadata.dc['dc:subject']))
class Preprint(OAIPreprint):
    """OAI preprint whose subjects come straight from dc:subject, overriding
    the base class's subject handling.
    """

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(ctx.record.metadata.dc['dc:subject'])
    )