class Project(Parser):
    """Field mapping for a project record.

    Core fields are read from ``ctx.attributes``, ``ctx.embeds``,
    ``ctx.links`` and the top-level ``contributors`` key; ``Extra`` keeps
    the related-resource hrefs and the remaining raw attributes.
    """

    title = ctx.attributes.title
    description = ctx.attributes.description
    contributors = tools.Map(
        tools.Delegate(Contributor),
        ctx['contributors'],
    )
    institutions = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Institution))),
        ctx.embeds.affiliated_institutions.data,
    )
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # Both the category and the tag list are fed to ThroughTags.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        ctx.attributes.category,
        ctx.attributes.tags,
    )
    # NOTE(review): the key handed to tools.Maybe is a dotted path; confirm
    # that Maybe resolves it as a path rather than as one literal key.
    rights = tools.Maybe(ctx, 'attributes.node_license')
    links = tools.Map(tools.Delegate(ThroughLinks), ctx.links.html)

    class Extra:
        # Values kept alongside the main fields.
        date_created = tools.ParseDate(ctx.attributes.date_created)

        # "related" hrefs of each relationship on the record.
        files = ctx.relationships.files.links.related.href
        # NOTE(review): dotted key passed to tools.Maybe, as for ``rights``.
        parent = tools.Maybe(ctx, 'relationships.parent.links.related.href')
        forks = ctx.relationships.forks.links.related.href
        root = ctx.relationships.root.links.related.href
        comments = ctx.relationships.comments.links.related.href
        registrations = ctx.relationships.registrations.links.related.href
        logs = ctx.relationships.logs.links.related.href
        node_links = ctx.relationships.node_links.links.related.href
        wikis = ctx.relationships.wikis.links.related.href
        children = ctx.relationships.children.links.related.href

        # Remaining raw attributes and top-level keys.
        fork = ctx.attributes.fork
        date_modified = ctx.attributes.date_modified
        collection = ctx.attributes.collection
        registration = ctx.attributes.registration
        type = ctx.type
        id = ctx.id
class Preprint(Parser):
    """Field mapping for a preprint whose data lives under ``ctx.entry``.

    Subjects and tags are both derived from the ``'@term'`` of each item in
    ``ctx.entry.category``; identifiers come from the optional
    ``'arxiv:doi'`` key and from ``ctx.entry.id``.
    """

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)

    # Not mapped here: free_to_read_type, free_to_read_date, rights, language.

    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(tools.Map(ctx['@term'], ctx.entry.category)),
    )
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Map(ctx['@term'], ctx.entry.category),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), ctx.entry.author),
    )

    # Not mapped here: related_works.

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx.entry['arxiv:doi']),
        ctx.entry.id,
    )

    class Extra:
        # Entry id plus the 'arxiv:'-prefixed keys, each wrapped in tools.Try.
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])
class Preprint(Parser):
    """Field mapping for a preprint whose data lives under ``ctx.entry``.

    Subjects are built by passing ``ctx.entry.category`` through
    ``get_subjects``; tags use the category items as they are.
    """

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Try(ctx.entry['arxiv:doi']),
        ctx.entry.id,
    )
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Subjects(
            tools.RunPython(
                'get_subjects',
                tools.Concat(tools.Try(ctx.entry.category)),
            )
        ),
    )
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        # Entry id plus the 'arxiv:'-prefixed keys, each wrapped in tools.Try.
        resource_id = ctx.entry.id
        journal_ref = tools.Try(ctx.entry['arxiv:journal_ref'])
        comment = tools.Try(ctx.entry['arxiv:comment'])
        primary_category = tools.Try(ctx.entry['arxiv:primary_category'])

    def get_subjects(self, link):
        """Return a list with the ``'@term'`` value of every item in *link*."""
        return [category['@term'] for category in link]
class CreativeWork(Parser):
    """Field mapping for a record whose metadata sits under
    ``ctx.record.metadata['DIF']``, with header values under
    ``ctx.record.header``.
    """

    title = tools.Join(ctx.record.metadata['DIF']['Entry_Title'])
    description = tools.Maybe(ctx.record.metadata['DIF']['Summary'], 'Abstract')

    organizations = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Organization))),
        tools.Maybe(ctx.record.metadata['DIF'], 'Data_Center'),
    )

    # Tags combine the DIF metadata name with the header's setSpec.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Name'),
        tools.Maybe(ctx.record.header, 'setSpec'),
    )

    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Maybe(ctx.record.metadata['DIF'], 'Related_URL'),
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    class Extra:
        # Entry_ID is read directly; the other values go through tools.Maybe.
        entry_id = ctx.record.metadata['DIF']['Entry_ID']
        metadata_name = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Name')
        metadata_version = tools.Maybe(ctx.record.metadata['DIF'], 'Metadata_Version')
        last_dif_revision_date = tools.Maybe(
            ctx.record.metadata['DIF'], 'Last_DIF_Revision_Date'
        )
        set_spec = tools.Maybe(ctx.record.header, 'setSpec')
class CreativeWork(Parser):
    """Field mapping for a work scraped from a parsed HTML page.

    Most fields are selected with ``Soup(ctx, ...)`` by ``itemprop`` or CSS
    class; the rest are produced by the helper methods below, which
    ``tools.RunPython`` references by name.
    """

    schema = tools.RunPython('get_type', ctx)
    title = tools.RunPython('get_title', ctx)
    description = Soup(ctx, 'p', class_='genericfile_description')['#text']
    date_published = tools.ParseDate(Soup(ctx, itemprop='datePublished')['#text'])
    date_updated = tools.ParseDate(Soup(ctx, itemprop='dateModified')['#text'])

    # Candidates in order: the link URL of the "Rights" entry, then its text,
    # then a static None.
    rights = tools.OneOf(
        tools.RunPython('get_rights_url', ctx),
        tools.RunPython('get_dd', ctx, 'Rights')['#text'],
        tools.Static(None),
    )
    language = tools.Try(
        tools.ParseLanguage(Soup(ctx, itemprop='inLanguage')['#text'])
    )
    tags = tools.Map(tools.Delegate(ThroughTags), Soup(ctx, itemprop='keywords'))

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(tools.RunPython('get_dd', ctx, 'Permanent Link')),
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Creator), Soup(ctx, itemprop='creator')),
        tools.Map(tools.Delegate(Contributor), Soup(ctx, itemprop='contributor')),
        tools.Map(tools.Delegate(Publisher), Soup(ctx, itemprop='publisher')),
    )

    class Extra:
        # Text of the <dd> that follows the <dt> with the given label.
        gwu_unit = tools.RunPython('get_dd', ctx, 'GW Unit')['#text']
        related_url = tools.RunPython('get_dd', ctx, 'Related URL')['#text']
        previous_publication_information = tools.RunPython(
            'get_dd', ctx, 'Previous Publication Information')['#text']
        depositor = tools.RunPython('get_dd', ctx, 'Depositor')['#text']
        characterization = tools.RunPython('get_dd', ctx, 'Characterization')['#text']

    def get_type(self, obj):
        """Map the ``itemtype`` of the first ``<div>`` to a schema name.

        Unknown item types fall back to ``'CreativeWork'``.
        """
        known_types = {
            'http://schema.org/CreativeWork': 'CreativeWork',
            'http://schema.org/Article': 'Article',
            'http://schema.org/Book': 'Book',
        }
        return known_types.get(obj.soup.find('div')['itemtype'], 'CreativeWork')

    def get_title(self, obj):
        """Return the text of the ``<h1>`` without its ``span.label`` child.

        The label span, when present, is removed from the parsed tree itself
        (``decompose``), so later reads of the same heading will not see it.
        A heading without a label span is returned unchanged; previously
        that case raised ``AttributeError`` because ``find`` returned None.
        """
        title = obj.h1.soup
        label = title.find('span', class_='label')
        if label is not None:
            label.decompose()
        return title.get_text()

    def get_dd(self, obj, dt):
        """Return the ``<dd>`` following the ``<dt>`` whose string is *dt*.

        The result is wrapped in ``SoupXMLDict``; ``None`` is returned when
        no matching ``<dt>`` exists.
        """
        dt_tag = obj.soup.find('dt', string=dt)
        if not dt_tag:
            return None
        return SoupXMLDict(soup=dt_tag.find_next_sibling('dd'))

    def get_rights_url(self, obj):
        """Return the ``href`` of the parent of the "Rights" entry's
        ``glyphicon-new-window`` icon.

        Raises when the "Rights" entry or the icon is missing.
        NOTE(review): presumably the enclosing ``tools.OneOf`` relies on that
        failure to fall through to the next candidate -- confirm.
        """
        dd = self.get_dd(obj, 'Rights')
        icon = dd.soup.find('i', class_='glyphicon-new-window')
        return icon.parent['href']
class Person(Parser):
    """Field mapping for a person described by a single ``ctx.name`` value.

    Each name part is taken from its own ``tools.ParseName(ctx.name)``
    link; affiliations come from the ``'arxiv:affiliation'`` key.
    """

    given_name = tools.ParseName(ctx.name).first
    family_name = tools.ParseName(ctx.name).last
    additional_name = tools.ParseName(ctx.name).middle
    suffix = tools.ParseName(ctx.name).suffix

    affiliations = tools.Map(
        tools.Delegate(Affiliation.using(entity=tools.Delegate(Organization))),
        tools.Maybe(ctx, 'arxiv:affiliation'),
    )
class Registration(Parser):
    """Field mapping for a registration record.

    Values are looked up in the context through the keys stored in the
    module-level ``FIELDS`` table; both date fields are derived from the
    ``timestamp`` of the 'registration date' value.
    """

    title = ctx[FIELDS['title']]
    description = ctx[FIELDS['summary']]
    date_published = tools.ParseDate(ctx[FIELDS['registration date']].timestamp)
    date_updated = tools.ParseDate(ctx[FIELDS['registration date']].timestamp)

    # Primary and other investigator are single delegates; the additional
    # investigators value is first split into a list by ``split_names``.
    related_agents = tools.Concat(
        tools.Delegate(PrincipalInvestigator, ctx[FIELDS['primary investigator']]),
        tools.Delegate(OtherInvestigator, ctx[FIELDS['other investigator']]),
        tools.Map(
            tools.Delegate(AdditionalInvestigator),
            tools.RunPython('split_names', ctx[FIELDS['additional investigators']]),
        ),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.RunPython('get_link', ctx.id),
    )

    class Extra:
        # Remaining form fields, copied through by their FIELDS key.
        registration_date = ctx[FIELDS['registration date']]
        questions_and_objectives = ctx[FIELDS['questions and objectives']]
        study_type = ctx[FIELDS['study type']]
        study_type_detail = ctx[FIELDS['study type other']]
        contact_details = ctx[FIELDS['contact details']]
        participating_institutions = ctx[FIELDS['participating institutions']]
        countries_of_recruitment = ctx[FIELDS['countries of recruitment']]
        funders = ctx[FIELDS['funders']]
        problems_studied = ctx[FIELDS['health conditions or problems studied']]
        patient_population = ctx[FIELDS['patient population']]
        interventions = ctx[FIELDS['interventions']]
        inclusion_criteria = ctx[FIELDS['inclusion criteria']]
        exclusion_criteria = ctx[FIELDS['exclusion criteria']]
        control_or_comparators = ctx[FIELDS['control or comparators']]
        primary_outcomes = ctx[FIELDS['primary outcomes']]
        key_secondary_outcomes = ctx[FIELDS['key secondary outcomes']]
        target_sample_size = ctx[FIELDS['target sample size']]
        recruitment_status = ctx[FIELDS['recruitment status']]
        other_recruitment_status = ctx[FIELDS['other recruitment status']]
        first_enrollment_date = ctx[FIELDS['first enrollment date']]
        expected_enrollment_completion_date = ctx[
            FIELDS['expected enrollment completion date']
        ]
        expected_research_completion_date = ctx[
            FIELDS['expected research completion date']
        ]
        ethical_approval = ctx[FIELDS['ethical approval']]
        ethical_approval_details = ctx[FIELDS['ethical approval details']]
        ethical_committee_judgment = ctx[FIELDS['ethical committee judgment']]
        data = ctx[FIELDS['data']]
        published_paper = ctx[FIELDS['published paper identifier']]
        study_website = ctx[FIELDS['study website']]
        study_results = ctx[FIELDS['study results']]

    def get_link(self, id):
        """Return ``LINK_FORMAT`` formatted with the record *id*."""
        return LINK_FORMAT.format(id)

    def split_names(self, obj):
        """Split *obj* on commas; return ``None`` when *obj* is falsy.

        The pieces are returned exactly as ``str.split`` yields them, with
        no whitespace stripping.
        """
        return obj.split(',') if obj else None
class Preprint(Parser):
    """Field mapping for a preprint read from ``ctx.item`` using its
    ``'dc:'``-prefixed keys.
    """

    title = ctx.item['dc:title']
    description = ctx.item.description
    contributors = tools.Map(
        tools.Delegate(Contributor),
        ctx.item['dc:creator'],
    )
    # NOTE(review): the raw 'dc:date' value is used without tools.ParseDate;
    # confirm this is intended.
    date_published = ctx.item['dc:date']
    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx.item['dc:publisher'],
    )
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        ctx.item['dc:identifier'],
    )
class FunderRelation(Parser):
    """Relation with schema ``'Funder'`` whose agent is a ``FunderAgent``
    built from the same context; awards are produced via ``get_award``.
    """

    schema = 'Funder'
    agent = tools.Delegate(FunderAgent, ctx)
    awards = tools.Map(
        tools.Delegate(ThroughAwards),
        tools.Try(tools.RunPython('get_award', ctx)),
    )

    def get_award(self, obj):
        """Return *obj* unchanged after checking that it has an ``'awardURI'``.

        The lookup result is discarded on purpose: the subscript is only
        there so that a missing key raises inside the surrounding
        ``tools.Try``.
        """
        obj['awardURI']
        return obj
class Person(Parser):
    """Field mapping for a person described by ``ctx.author``, with
    ``email`` and ``institution`` values on the same context.
    """

    given_name = tools.ParseName(ctx.author).first
    family_name = tools.ParseName(ctx.author).last
    additional_name = tools.ParseName(ctx.author).middle
    suffix = tools.ParseName(ctx.author).suffix

    # NOTE(review): here tools.Try(ctx.email) is an argument of Delegate, not
    # of Map (unlike ``related_agents`` below) -- confirm this is intended.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier, tools.Try(ctx.email))
    )
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Try(ctx.institution),
    )
class Preprint(Parser):
    """Field mapping for a preprint described by flat metadata keys such as
    ``'DC.Title'``, ``'og:url'`` and ``'citation_doi'``.

    Creators are assembled by ``get_contributors`` from the parallel
    ``citation_author*`` values.
    """

    title = tools.Try(ctx['DC.Title'])
    description = tools.Try(ctx['DC.Description'])
    # Not mapped here: is_deleted.
    date_published = tools.ParseDate(tools.Try(ctx['article:published_time']))
    date_updated = tools.ParseDate(tools.Try(ctx['DC.Date']))
    # Not mapped here: free_to_read_type, free_to_read_date.
    rights = tools.Try(ctx['DC.Rights'])
    language = tools.Try(ctx['DC.Language'])

    # A static 'Biology' subject is included besides the record's own
    # subject areas.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Static('Biology'),
        tools.Subjects(tools.Try(ctx['subject-areas'])),
    )
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx['category']),
        tools.Try(ctx['subject-areas']),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx['og:url']),
        ctx['citation_public_url'],
        ctx['citation_doi'],
    )

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(Publisher), tools.Try(ctx['DC.Publisher'])),
        tools.Map(tools.Delegate(Creator), tools.RunPython('get_contributors', ctx)),
    )
    # Not mapped here: related_works.

    class Extra:
        identifiers = ctx['DC.Identifier']
        access_rights = ctx['DC.AccessRights']

    def get_contributors(self, link):
        """Combine the parallel author, email and institution values of *link*.

        Each of the three ``citation_author*`` entries may be absent, a list,
        or a single value; a single value is wrapped in a one-element list.
        The lists are zipped with ``itertools.zip_longest``, so shorter ones
        are padded with ``None``. Returns one dict per position with the
        keys ``'author'``, ``'institution'`` and ``'email'``.
        """

        def as_list(key):
            # Missing key -> empty list; list -> as is; anything else -> [value].
            value = link.get(key, [])
            return value if isinstance(value, list) else [link[key]]

        authors = as_list('citation_author')
        institutions = as_list('citation_author_institution')
        emails = as_list('citation_author_email')

        return [
            {'author': author, 'institution': institution, 'email': email}
            for author, email, institution
            in itertools.zip_longest(authors, emails, institutions)
        ]
class Person(Parser):
    """Field mapping for a person described by ``ctx.author``, with
    ``email`` and ``institution`` values on the same context.
    """

    given_name = tools.ParseName(ctx.author).first
    family_name = tools.ParseName(ctx.author).last
    additional_name = tools.ParseName(ctx.author).middle
    suffix = tools.ParseName(ctx.author).suffix

    emails = tools.Map(
        tools.Delegate(PersonEmail),
        tools.Try(ctx.email),
    )
    affiliations = tools.Map(
        tools.Delegate(Affiliation.using(entity=tools.Delegate(Organization))),
        tools.Try(ctx.institution),
    )
class Preprint(Parser):
    """Field mapping for a preprint read from ``ctx.item`` using its
    ``'dc:'``-prefixed keys, with one static subject.
    """

    title = ctx.item['dc:title']
    description = ctx.item.description
    contributors = tools.Map(
        tools.Delegate(Contributor),
        ctx.item['dc:creator'],
    )
    # NOTE(review): the raw 'dc:date' value is used without tools.ParseDate;
    # confirm this is intended.
    date_published = ctx.item['dc:date']
    publishers = tools.Map(
        tools.Delegate(Association.using(entity=tools.Delegate(Publisher))),
        ctx.item['dc:publisher'],
    )
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        ctx.item['dc:identifier'],
    )
    # Every record gets the same static subject.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology and life sciences')),
    )
class Preprint(Parser):
    """Field mapping for a preprint whose data lives under ``ctx.entry``.

    The single ``subject`` is a ``Tag`` built from the
    ``'arxiv:primary_category'`` value.
    """

    title = ctx.entry.title
    description = ctx.entry.summary
    date_published = tools.ParseDate(ctx.entry.published)
    date_updated = tools.ParseDate(ctx.entry.updated)
    contributors = tools.Map(tools.Delegate(Contributor), ctx.entry.author)
    links = tools.Map(
        tools.Delegate(ThroughLinks),
        tools.Maybe(ctx.entry, 'arxiv:doi'),
        ctx.entry.id,
    )
    subject = tools.Delegate(Tag, ctx.entry['arxiv:primary_category'])
    tags = tools.Map(tools.Delegate(ThroughTags), ctx.entry.category)

    class Extra:
        # Entry id plus two 'arxiv:'-prefixed keys read through tools.Maybe.
        resource_id = ctx.entry.id
        journal_ref = tools.Maybe(ctx.entry, 'arxiv:journal_ref')
        comment = tools.Maybe(ctx.entry, 'arxiv:comment')
class CreativeWork(Parser):
    """Field mapping for a work whose identifier and agent relations are
    all delegated with the whole context.
    """

    title = ctx.title
    identifiers = tools.Map(tools.Delegate(WorkIdentifier), ctx)

    related_agents = tools.Concat(
        tools.Map(tools.Delegate(FunderRelation), ctx),
        tools.Map(tools.Delegate(ContributorRelation), ctx),
        # Only applied when the context contains an 'awardeeName' key.
        tools.Map(
            tools.Delegate(AgentWorkRelation),
            tools.Filter(lambda record: 'awardeeName' in record, ctx),
        ),
    )

    date_updated = tools.ParseDate(ctx.date)

    class Extra:
        public_access_mandate = ctx.publicAccessMandate
class FunderAgent(Parser):
    """Field mapping for a funding agent named by ``funderName`` or,
    failing that, ``contributorName``.
    """

    schema = tools.GuessAgentType(
        tools.OneOf(ctx.funderName, ctx.contributorName),
        default='organization',
    )
    name = tools.OneOf(ctx.funderName, ctx.contributorName)

    # Identifier candidates in order: funderIdentifier, the text of
    # nameIdentifier, then a static None. The tools.Try around tools.IRI is
    # given ValueError as its exceptions argument.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Try(
            tools.IRI(
                tools.OneOf(
                    ctx.funderIdentifier,
                    tools.RunPython(force_text, ctx.nameIdentifier),
                    tools.Static(None),
                )
            ),
            exceptions=(ValueError, ),
        ),
    )

    class Extra:
        # Raw identifier and contributor values, each wrapped in tools.Try.
        name_identifier = tools.Try(ctx.nameIdentifier)
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme']
        )
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI']
        )
        funder_identifier = tools.Try(ctx.funderIdentifier)
        funder_identifier_type = tools.Try(ctx.funderIdentifierType)
        contributor_type = tools.Try(ctx.contributorType)
class FundingAgent(Parser):
    """Field mapping for a funding agent named by ``ctx.sponsorName`` and
    identified by ``ctx.sponsorIdentifier``.
    """

    schema = tools.GuessAgentType(ctx.sponsorName, default='organization')
    name = ctx.sponsorName
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.IRI(tools.Try(ctx.sponsorIdentifier)),
    )
class OAIRelatedWork(Parser):
    """Related work with schema ``'CreativeWork'``; the whole context is
    used both as its identifier source and as the ``identifier`` extra.
    """

    schema = 'CreativeWork'
    identifiers = tools.Map(
        tools.Delegate(OAIWorkIdentifier),
        ctx,
    )

    class Extra:
        identifier = ctx
class Person(Parser):
    """Field mapping for a person embedded under ``ctx.embeds.users``.

    Each name part is read from ``data.attributes`` first and from
    ``errors[0].meta`` second.
    """

    given_name = tools.OneOf(
        ctx.embeds.users.data.attributes.given_name,
        ctx.embeds.users.errors[0].meta.given_name,
    )
    family_name = tools.OneOf(
        ctx.embeds.users.data.attributes.family_name,
        ctx.embeds.users.errors[0].meta.family_name,
    )
    additional_name = tools.OneOf(
        ctx.embeds.users.data.attributes.middle_names,
        ctx.embeds.users.errors[0].meta.middle_names,
    )
    suffix = tools.OneOf(
        ctx.embeds.users.data.attributes.suffix,
        ctx.embeds.users.errors[0].meta.suffix,
    )

    # The html link is passed through ``registered``; the profile image link
    # is wrapped in tools.Try.
    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.RunPython('registered', ctx.embeds.users.data.links.html),
        tools.Try(ctx.embeds.users.data.links.profile_image),
    )

    class Extra:
        locale = tools.Try(ctx.embeds.users.data.attributes.locale)
        date_registered = tools.Try(
            ctx.embeds.users.data.attributes.date_registered
        )
        active = tools.Try(ctx.embeds.users.data.attributes.active)
        timezone = tools.Try(ctx.embeds.users.data.attributes.timezone)

    def registered(self, context):
        """Return *context*, or ``None`` when the contributor is unregistered.

        The flag is read from
        ``self.context['attributes']['unregistered_contributor']``.
        """
        unregistered = self.context['attributes']['unregistered_contributor']
        return None if unregistered else context
class Preprint(Parser):
    """Field mapping for a preprint read from ``ctx.item`` using its
    ``'dc:'``-prefixed keys.

    ``date_published`` and ``date_updated`` are both parsed from the same
    ``'dc:date'`` value.
    """

    title = ctx.item['dc:title']
    description = ctx.item.description

    date_published = tools.ParseDate(ctx.item['dc:date'])
    date_updated = tools.ParseDate(ctx.item['dc:date'])

    # Every record gets the same static subject.
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        tools.Concat(tools.Static('Biology')),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.item['dc:identifier'],
    )

    related_agents = tools.Concat(
        tools.Delegate(Publisher, ctx.item['dc:publisher']),
        tools.Map(tools.Delegate(Creator), ctx.item['dc:creator']),
    )
class Project(CreativeWork):
    """``CreativeWork`` subclass flagged ``is_root`` that adds related works
    and agents.

    Contributors are split on ``attributes.bibliographic``: truthy ones go
    to ``Creator``, the rest to ``Contributor``.
    """

    is_root = True

    related_works = tools.Map(
        tools.Delegate(IsPartOf),
        tools.Try(ctx.children),
    )

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(Creator),
            tools.Filter(
                lambda contributor: contributor['attributes']['bibliographic'],
                ctx.contributors,
            ),
        ),
        tools.Map(
            tools.Delegate(Contributor),
            tools.Filter(
                lambda contributor: not contributor['attributes']['bibliographic'],
                ctx.contributors,
            ),
        ),
        tools.Map(
            tools.Delegate(AgentWorkRelation),
            tools.Try(ctx.institutions),
        ),
    )
class ContributorAgent(Parser):
    """Agent with schema ``'Person'`` named by ``piLastName`` and
    ``piFirstName``.
    """

    schema = 'Person'
    family_name = ctx.piLastName
    given_name = ctx.piFirstName

    # Only applied when the context contains an 'awardeeName' key.
    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Filter(lambda record: 'awardeeName' in record, ctx),
    )
class Agent(Parser):
    """Field mapping for an agent whose schema is guessed from ``ctx.name``.

    Identifiers are built from ``sameAs`` and ``email``, each passed
    through ``tools.IRI``.
    """

    schema = tools.GuessAgentType(ctx.name)
    name = ctx.name

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Try(ctx.affiliation),
    )

    identifiers = tools.Map(
        tools.Delegate(AgentIdentifier),
        tools.Map(
            tools.IRI(),
            tools.Try(ctx.sameAs),
            tools.Try(ctx.email),
        ),
    )

    class Extra:
        givenName = tools.Try(ctx.givenName)
        familyName = tools.Try(ctx.familyName)
        # NOTE(review): 'additonalName' looks like a misspelling of
        # 'additionalName'; kept as is because it is the attribute name
        # read at runtime.
        additonalName = tools.Try(ctx.additionalName)
        name = tools.Try(ctx.name)
class DataCenterAgent(Parser):
    """Field mapping for an agent named by ``Data_Center_Name.Long_Name``,
    with ``Personnel`` as its affiliated agents.
    """

    schema = tools.GuessAgentType(
        ctx.Data_Center_Name.Long_Name,
        default='organization',
    )
    name = ctx.Data_Center_Name.Long_Name

    related_agents = tools.Map(
        tools.Delegate(IsAffiliatedWith),
        tools.Try(ctx.Personnel),
    )

    class Extra:
        data_center_short_name = ctx.Data_Center_Name.Short_Name
class CreativeWork(Parser):
    """Field mapping for a work read from ``ctx.attributes`` and
    ``ctx.links``; ``is_deleted`` is always a static ``False``.
    """

    title = ctx.attributes.title
    description = ctx.attributes.description
    is_deleted = tools.Static(False)
    # Not mapped here: date_published.
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    # Not mapped here: free_to_read_type, free_to_read_date, language.
    # rights is left unmapped as well: ctx.attributes.node_license doesn't
    # seem to carry any useful information.

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.links.html,
        ctx.links.self,
    )
    # Both the category and the tag list are fed to ThroughTags.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        ctx.attributes.category,
        ctx.attributes.tags,
    )

    class Extra:
        date_created = tools.ParseDate(ctx.attributes.date_created)
class CreatorPerson(Parser):
    """Person (schema ``'Person'``) whose name parts are parsed from
    ``ctx.creatorName``.
    """

    schema = 'Person'

    suffix = tools.ParseName(ctx.creatorName).suffix
    family_name = tools.ParseName(ctx.creatorName).last
    given_name = tools.ParseName(ctx.creatorName).first
    additional_name = tools.ParseName(ctx.creatorName).middle

    # The affiliation value is passed through force_text before delegation.
    affiliations = tools.Map(
        tools.Delegate(
            Affiliation.using(entity=tools.Delegate(CreatorOrganization))
        ),
        tools.Concat(
            tools.Try(tools.RunPython(force_text, ctx.affiliation))
        ),
    )

    identifiers = tools.Map(
        tools.Delegate(ThroughIdentifiers),
        tools.Try(ctx.nameIdentifier),
    )

    class Extra:
        # Text of nameIdentifier and its two '@' attributes.
        name_identifier = tools.Try(
            tools.RunPython(force_text, ctx.nameIdentifier)
        )
        name_identifier_scheme = tools.Try(
            ctx.nameIdentifier['@nameIdentifierScheme']
        )
        name_identifier_scheme_uri = tools.Try(
            ctx.nameIdentifier['@schemeURI']
        )
class OAIAgentWorkRelation(Parser):
    """Relation with schema ``'AgentWorkRelation'``; both the agent and
    ``cited_as`` are derived from the text of the context.
    """

    schema = 'AgentWorkRelation'

    agent = tools.Delegate(OAIAgent, tools.RunPython('force_text', ctx))
    cited_as = tools.RunPython('force_text', ctx)

    def force_text(self, data):
        """Return *data* as text.

        A ``str`` is returned unchanged and a ``dict`` yields its
        ``'#text'`` entry; any other type raises ``TypeError`` carrying
        *data*.
        """
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            return data['#text']
        raise TypeError(data)
class DataSet(Parser):
    """Field mapping for a data set whose metadata sits under
    ``ctx.record.metadata.DIF``, with header values under
    ``ctx.record.header``.

    ``is_deleted`` is computed from the header's ``'@status'`` value by
    ``check_status``.
    """

    title = tools.Join(tools.Try(ctx.record.metadata.DIF.Entry_Title))
    description = tools.Try(ctx.record.metadata.DIF.Summary.Abstract)

    related_agents = tools.Map(
        tools.Delegate(AgentWorkRelation),
        tools.Try(ctx.record.metadata.DIF.Data_Center),
    )

    # Tags combine the DIF metadata name with the header's setSpec.
    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.record.metadata.DIF.Metadata_Name),
        tools.Try(ctx.record.header.setSpec),
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        tools.Try(ctx.record.metadata.DIF),
    )

    date_updated = tools.ParseDate(ctx.record.header.datestamp)

    is_deleted = tools.RunPython(
        'check_status',
        tools.Try(ctx.record.header['@status']),
    )

    class Extra:
        status = tools.Try(ctx.record.header['@status'])

        entry_id = tools.Try(ctx.record.metadata.DIF.Entry_ID)
        metadata_name = tools.Try(ctx.record.metadata.DIF.Metadata_Name)
        metadata_version = tools.Try(ctx.record.metadata.DIF.Metadata_Version)
        last_dif_revision_date = tools.Try(
            ctx.record.metadata.DIF.Last_DIF_Revision_Date
        )

        # Unlike the values above, setSpec is read without tools.Try.
        set_spec = ctx.record.header.setSpec

    def check_status(self, status):
        """Return ``True`` only when *status* equals ``'deleted'``."""
        return status == 'deleted'
class Preprint(osf.Project):
    """``osf.Project`` subclass that redefines the preprint-specific fields.

    ``related_works`` is a static empty list. Contributors are split on
    ``attributes.bibliographic``: truthy ones go to ``osf.Creator``, the
    rest to ``osf.Contributor``.
    """

    description = tools.Try(ctx.attributes.abstract)
    date_updated = tools.ParseDate(ctx.attributes.date_modified)
    date_published = tools.ParseDate(ctx.attributes.date_created)

    # NOTE: OSF has a direct mapping to SHARE's taxonomy. Subjects() is not needed
    subjects = tools.Map(
        tools.Delegate(ThroughSubjects),
        ctx.attributes.subjects,
    )

    identifiers = tools.Map(
        tools.Delegate(WorkIdentifier),
        ctx.links.self,
        ctx.links.html,
        tools.Try(ctx.links.doi),
    )

    tags = tools.Map(
        tools.Delegate(ThroughTags),
        tools.Try(ctx.attributes.tags),
    )
    rights = tools.Try(ctx.attributes.node_license)

    related_works = tools.Static([])

    related_agents = tools.Concat(
        tools.Map(
            tools.Delegate(osf.Creator),
            tools.Filter(
                lambda contributor: contributor['attributes']['bibliographic'],
                ctx.contributors,
            ),
        ),
        tools.Map(
            tools.Delegate(osf.Contributor),
            tools.Filter(
                lambda contributor: not contributor['attributes']['bibliographic'],
                ctx.contributors,
            ),
        ),
    )
class Person(Parser):
    """Field mapping for a person embedded under ``ctx.embeds.users.data``.

    All values are read directly, with no ``tools.Try`` or fallback.
    """

    given_name = ctx.embeds.users.data.attributes.given_name
    family_name = ctx.embeds.users.data.attributes.family_name
    additional_name = ctx.embeds.users.data.attributes.middle_names
    suffix = ctx.embeds.users.data.attributes.suffix

    # The html link serves both as the url and as the identifier source.
    url = ctx.embeds.users.data.links.html
    identifiers = tools.Map(
        tools.Delegate(ThroughIdentifiers),
        ctx.embeds.users.data.links.html,
    )

    class Extra:
        nodes = ctx.embeds.users.data.relationships.nodes.links.related.href
        locale = ctx.embeds.users.data.attributes.locale
        date_registered = ctx.embeds.users.data.attributes.date_registered
        active = ctx.embeds.users.data.attributes.active
        timezone = ctx.embeds.users.data.attributes.timezone
        profile_image = ctx.embeds.users.data.links.profile_image