def schema(self):
    return {
        'contributors': ('/authorNames', process_contributors),
        'uris': {
            'canonicalUri': '/articleFullUrl',
            'objectUris': ('/doi', lambda x: ['http://dx.doi.org/' + x])
        },
        'title': ('/bibliographyTitle', '/blurbTitle', lambda x, y: x or y),
        'providerUpdatedDateTime': ('/publishedDate', lambda x: parse(x).isoformat()),
        'description': '/blurbText',
        'freeToRead': {
            'startDate': ('/is_free', '/publishedDate', lambda x, y: y if x else None)
        },
        'otherProperties': build_properties(
            ('imageURL', '/imageUrl', {'description': 'an image url'}),
            ('type', '/type'),
            ('isOpenAccess', '/isOpenAccess'),
            ('articleUrl', '/articleUrl'),
            ('articleFullUrl', '/articleFullUrl'),
            ('isFree', '/isFree'),
            ('isHighlyAccessed', '/isHighlyAccessed'),
            ('status', '/status'),
            ('abstractPath', '/abstractPath'),
            ('journal Id', '/journal Id'),
            ('article_host', '/article_host'),
            ('longCitation', '/longCitation'),
            ('is_subscription', '/is_subscription')
        )
    }
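The schemas in this collection are declarative: plain strings are JSON pointers (or xpaths) resolved against each harvested document, and tuples resolve every leading selector before passing the values to the trailing callable. A simplified sketch of that evaluation rule, assuming a resolve(selector) lookup; scrapi's real transformer is more involved:

def apply_schema(schema, resolve):
    # Illustrative only: resolve(selector) returns the value a JSON pointer
    # or xpath selects from one harvested document.
    def evaluate(spec):
        if isinstance(spec, str):
            return resolve(spec)
        if isinstance(spec, dict):
            return {key: evaluate(value) for key, value in spec.items()}
        if isinstance(spec, tuple):
            *selectors, transform = spec
            return transform(*(resolve(s) for s in selectors))
        return spec  # anything else (e.g. build_properties output) is handled elsewhere
    return {key: evaluate(value) for key, value in schema.items()}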
def schema(self):
    return {
        'title': ('/title', lambda x: x if x else ''),
        'providerUpdatedDateTime': ('/date', datetime_formatter),
        'uris': {
            'canonicalUri': '/uri',
            'providerUris': ('/uri', lambda x: [x]),
            'objectUris': ('/pmid', '/doi', process_object_uris)
        },
        'contributors': '/authors',
        'subjects': '/subjects',
        'tags': '/keywords',
        'publisher': ('/publisher', lambda x: {'name': x} if x else ''),
        'otherProperties': build_properties(
            ('journalTitle', '/journalTitle'),
            ('abstract', ('/abstract', lambda x: x if x else '')),
            ('type', '/types'),
            ('ISSN', ('/issn', lambda x: x if x else '')),
            ('number', '/number'),
            ('ISBN', '/isbn'),
            ('startPage', '/startPage'),
            ('endPage', '/endPage'),
            ('volume', '/volume'),
        )
    }
def test_arg_kwargs(self):
    def process_title(title, title1="test"):
        return title[0] + (title1[0] if isinstance(title1, list) else title1)

    def process_title2(title1="test"):
        return title1[0] if isinstance(title1, list) else title1

    args = ("//dc:title/node()",)
    kwargs = {"title1": "//dc:title/node()"}

    self.harvester.schema = updated_schema(TEST_SCHEMA, {
        'title': (pack(*args, **kwargs), process_title),
        'otherProperties': build_properties(
            ('title2', (pack(*args), process_title)),
            ('title3', (pack(**kwargs), process_title2)),
            ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title))
        )
    })

    results = [
        self.harvester.normalize(record)
        for record in self.harvester.harvest(days_back=1)
    ]

    for result in results:
        assert result['title'] == "TestTest"
        assert result['otherProperties'][0]['properties']['title2'] == 'Testtest'
        assert result['otherProperties'][1]['properties']['title3'] == 'Test'
        assert result['otherProperties'][2]['properties']['title4'] == "TestTest"
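This test only makes sense if pack bundles positional and keyword selectors so the normalizer can resolve each one and then call the processing function with them. A minimal sketch of that contract (hypothetical; the real scrapi helper may differ):

def pack(*args, **kwargs):
    # Hypothetical carrier: the normalizer resolves every selector in
    # 'args' and 'kwargs', then calls fn(*resolved_args, **resolved_kwargs).
    return {'args': args, 'kwargs': kwargs}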
def schema(self):
    return {
        'title': ('/title', lambda x: x or ''),
        'description': '/notes',
        'providerUpdatedDateTime': ('/metadata_modified', datetime_formatter),
        'uris': {
            # Construct new urls directing to LWBIN
            'canonicalUri': ('/name', lambda x: construct_url(self.url, self.dataset_path, x)),
            # Default urls from the metadata directing to source pages
            'objectUris': ('/url', '/extras', process_object_uris)
        },
        'contributors': ('/author', '/author_email', process_contributors),
        'licenses': ('/license_title', '/license_url', '/license_id', process_licenses),
        'tags': ('/tags', lambda x: [tag['name'].lower() for tag in (x or [])]),
        'freeToRead': {
            'startDate': ('/isopen', '/metadata_created', lambda x, y: parse(y).date().isoformat() if x else None)
        },
        'otherProperties': build_properties(
            ('maintainer', '/maintainer'),
            ('maintainerEmail', '/maintainer_email'),
            ('revisionTimestamp', ('/revision_timestamp', datetime_formatter)),
            ('id', '/id'),
            ('metadataCreated', ('/metadata_created', datetime_formatter)),
            ('state', '/state'),
            ('version', '/version'),
            ('creatorUserId', '/creator_user_id'),
            ('type', '/type'),
            ('numberOfResources', '/num_resources'),
            ('numberOfTags', '/num_tags'),
            ('name', '/name'),
            ('groups', '/groups'),
        )
    }
def schema(self):
    return {
        'contributors': ('/creators', compose(default_name_parser, lambda authors: [author['creator'] for author in authors])),
        'uris': ('/url', process_urls),
        'title': '/title',
        'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
        'description': '/abstract',
        'freeToRead': {
            'startDate': ('/openaccess', '/publicationDate', lambda x, y: y if x == 'true' else None)
        },
        'publisher': {
            'name': '/publisher'
        },
        'subjects': ('/genre', lambda x: [x] if x else []),
        'otherProperties': build_properties(
            ('url', '/url'),
            ('doi', '/doi'),
            ('isbn', '/isbn'),
            ('printIsbn', '/printIsbn'),
            ('electronicIsbn', '/electronicIsbn'),
            ('volume', '/volume'),
            ('number', '/number'),
            ('startingPage', '/startingPage'),
            ('copyright', '/copyright'),
            ('identifier', '/identifier')
        )
    }
def schema(self):
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        'providerUpdatedDateTime': ('/issued/date-parts', lambda x: parse(' '.join([str(part) for part in x[0]])).date().isoformat()),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', lambda x: [
            process_contributor(*[
                '{} {}'.format(entry.get('given'), entry.get('family')),
                entry.get('ORCID')
            ]) for entry in x
        ]),
        'otherProperties': build_properties(
            ('journalTitle', '/container-title'),
            ('volume', '/volume'),
            ('tags', ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])),
            ('issue', '/issue'),
            ('publisher', '/publisher'),
            ('type', '/type'),
            ('ISSN', '/ISSN'),
            ('ISBN', '/ISBN'),
            ('member', '/member'),
            ('score', '/score'),
            ('issued', '/issued'),
            ('deposited', '/deposited'),
            ('indexed', '/indexed'),
            ('page', '/page'),
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp')
        )
    }
def formatted_properties(self):
    return {
        'otherProperties': build_properties(*[
            (item, ('//dc:{}/node()'.format(item), '//ns0:{}/node()'.format(item), self.resolve_property))
            for item in self.property_list
        ])
    }
def schema(self): return { "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser), "uris": { "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)), "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()", self.construct_descriptor_uris) }, "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)), "title": ('//PROJECT_TITLE/node()', single_result), "tags": ('//PROJECT_TERMSX/TERM/node()'), "otherProperties": build_properties( ("applicationID", "//APPLICATION_ID/node()"), ('activity', '//ACTIVITY/node()'), ('administeringIC', '//ADMINISTERING_IC/node()'), ('arraFunded', '//ARRA_FUNDED/node()'), ('budgetStart', '//BUDGET_START/node()'), ('budgetEnd', '//BUDGET_END/node()'), ('FOANumber', '//FOA_NUMBER/node()'), ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'), ('fundingICs', '//FUNDING_ICs/node()'), ('fiscalYear', '//FY/node()'), ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'), ('organizationCity', '//ORG_CITY/node()'), ('organizationCountry', '//ORG_CONTRY/node()'), ('organizationDistrict', '//ORG_DISTRICT/node()'), ('organizationDUNS', '//ORG_DUNS/node()'), ('organizationDept', '//ORG_DEPT/node()'), ('organizationFIPS', '//ORG_FIPS/node()'), ('organizationState', '//ORG_STATE/node()'), ('organizationZipcode', '//ORG_ZIPCODE/node()'), ('ICName', '//IC_NAME/node()'), ('organizationName', '//ORG_NAME/node()'), ('projectStart', '//PROJECT_START/node()'), ('projectEnd', '//PROJECT_END/node()'), ('PHR', '//PHR/node()'), ('serialNumber', '//SERIAL_NUMBER/node()'), ('studySection', '//STUDY_SECTION/node()'), ('studySectionName', '//STUDY_SECTION_NAME/node()'), ('supportYear', '//SUPPORT_YEAR/node()'), ('suffix', '//SUFFIX/node()'), ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'), ('totalCost', '//TOTAL_COST/node()'), ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'), ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'), ('CFDACode', '//CFDA_CODE/node()'), ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'), ('edInstType', '//ED_INST_TYPE/node()'), ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'), ('fundingMechanism', '//FUNDING_MECHANISM/node()')) }
def schema(self):
    properties = {
        'otherProperties': build_properties(*[
            (item, ('//dc:{}/node()'.format(item), '//ns0:{}/node()'.format(item), self.resolve_property))
            for item in self.property_list
        ])
    }
    return updated_schema(OAISCHEMA, properties)
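For context, updated_schema overlays provider-specific entries onto a base schema such as OAISCHEMA, and build_properties turns (name, selector, ...) tuples into the otherProperties entries that test_arg_kwargs reads back as result['otherProperties'][n]['properties'][name]. A minimal sketch under those assumptions (not the actual scrapi code):

def updated_schema(base, overrides):
    # Shallow merge: provider-specific entries win over the base schema.
    merged = dict(base)
    merged.update(overrides)
    return merged


def build_properties(*property_tuples):
    # Each (name, selector, ...) tuple becomes one otherProperties entry;
    # the normalizer later replaces the selector with its resolved value.
    return [
        {'name': entry[0], 'properties': {entry[0]: entry[1]}}
        for entry in property_tuples
    ]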
def schema(self):
    return {
        'contributors': ('/authors', process_contributors),
        'uris': {
            'objectUris': ('/url', '/full_dataset_url', compose(filter_none, lambda x, y: [x, y])),
            'descriptorUris': ('/DOI', '/paper_url', compose(filter_none, lambda x, y: [('http://dx.doi.org/{}'.format(x) if x else None), y])),
            'canonicalUri': '/url',
        },
        'title': '/name',
        'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
        'description': '/description',
        'otherProperties': build_properties(
            ('owner_name', '/owner_name'),
        )
    }
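The compose(filter_none, lambda x, y: [x, y]) pattern above reads right to left: the innermost callable receives the resolved field values, and each outer callable transforms the result. A sketch of a compose with that calling convention (assumed here; scrapi ships its own):

def compose(*functions):
    # compose(f, g)(*args) == f(g(*args))
    def composed(*args, **kwargs):
        result = functions[-1](*args, **kwargs)
        for function in reversed(functions[:-1]):
            result = function(result)
        return result
    return composed

# e.g. compose(filter_none, lambda x, y: [x, y])('http://a', None) -> ['http://a'],
# assuming filter_none drops None entries from a list.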
def schema(self):
    return {
        'contributors': ('/contributors', process_contributors),
        'title': ('/title', process_null),
        'providerUpdatedDateTime': ('/date_created', parse_date),
        'description': ('/description', process_null),
        'uris': {
            'canonicalUri': ('/url', lambda x: 'http://osf.io' + x),
        },
        'tags': ('/tags', process_tags),
        'otherProperties': build_properties(
            ('parent_title', '/parent_title'),
            ('category', '/category'),
            ('wiki_link', '/wiki_link'),
            ('is_component', '/is_component'),
            ('is_registration', '/is_registration'),
            ('parent_url', '/parent_url'),
            ('journal Id', '/journal Id')
        )
    }
def schema(self): return { "title": ("/title", lambda x: x[0] if x else ""), "description": ("/subtitle", lambda x: x[0] if (isinstance(x, list) and x) else x or ""), "providerUpdatedDateTime": ( "/issued/date-parts", lambda x: parse(" ".join([str(part) for part in x[0]])).date().isoformat(), ), "uris": {"canonicalUri": "/URL"}, "contributors": ( "/author", compose( lambda x: [ process_contributor( *["{} {}".format(entry.get("given"), entry.get("family")), entry.get("ORCID")] ) for entry in x ], lambda x: x or [], ), ), "sponsorships": ("/funder", lambda x: process_sponsorships(x) if x else []), "otherProperties": build_properties( ("journalTitle", "/container-title"), ("volume", "/volume"), ("tags", ("/subject", "/container-title", lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])), ("issue", "/issue"), ("publisher", "/publisher"), ("type", "/type"), ("ISSN", "/ISSN"), ("ISBN", "/ISBN"), ("member", "/member"), ("score", "/score"), ("issued", "/issued"), ("deposited", "/deposited"), ("indexed", "/indexed"), ("page", "/page"), ("issue", "/issue"), ("volume", "/volume"), ("referenceCount", "/reference-count"), ("updatePolicy", "/update-policy"), ("depositedTimestamp", "/deposited/timestamp"), ), }
def schema(self):
    return {
        'contributors': ('/contributors', process_contributors),
        'title': ('/title', process_null),
        'providerUpdatedDateTime': ('/date_registered', date_formatter),
        'description': ('/description', process_null),
        'uris': {
            'canonicalUri': ('/url', lambda x: 'http://osf.io' + x),
        },
        'tags': ('/tags', process_tags),
        'otherProperties': build_properties(
            ('parent_title', '/parent_title'),
            ('category', '/category'),
            ('wiki_link', '/wiki_link'),
            ('is_component', '/is_component'),
            ('is_registration', '/is_registration'),
            ('parent_url', '/parent_url'),
            ('journal Id', '/journal Id')
        )
    }
def schema(self):
    return {
        'contributors': ('/contributors', process_contributors),
        'title': ('/title', lambda x: x or ''),
        'providerUpdatedDateTime': ('/date_registered', datetime_formatter),
        'description': '/description',
        'uris': {
            'canonicalUri': ('/url', url_from_guid),
            'providerUris': ('/url', compose(coerce_to_list, url_from_guid))
        },
        'tags': '/tags',
        'otherProperties': build_properties(
            ('parent_title', '/parent_title'),
            ('category', '/category'),
            ('wiki_link', '/wiki_link'),
            ('is_component', '/is_component'),
            ('is_registration', '/is_registration'),
            ('parent_url', '/parent_url'),
            ('journal Id', '/journal Id')
        )
    }
def schema(self):
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        'providerUpdatedDateTime': ('/issued/date-parts', lambda x: parse(' '.join([str(part) for part in x[0]])).isoformat()),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', lambda x: [
            process_contributor(*[
                '{} {}'.format(entry.get('given'), entry.get('family')),
                entry.get('ORCID')
            ]) for entry in x
        ]),
        'otherProperties': build_properties(
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp'),
            ('Empty', '/trash/not-here'),
            ('Empty2', '/')
        )
    }
def schema(self):
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        'providerUpdatedDateTime': ('/issued/date-parts', compose(datetime_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', compose(lambda x: [
            process_contributor(*[
                '{} {}'.format(entry.get('given'), entry.get('family')),
                entry.get('ORCID')
            ]) for entry in x
        ], lambda x: x or [])),
        'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
        'tags': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
        'subjects': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
        'otherProperties': build_properties(
            ('journalTitle', '/container-title'),
            ('volume', '/volume'),
            ('issue', '/issue'),
            ('publisher', '/publisher'),
            ('type', '/type'),
            ('ISSN', '/ISSN'),
            ('ISBN', '/ISBN'),
            ('member', '/member'),
            ('score', '/score'),
            ('issued', '/issued'),
            ('deposited', '/deposited'),
            ('indexed', '/indexed'),
            ('page', '/page'),
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp')
        )
    }
def schema(self):
    return {
        'title': ('/title', lambda x: x[0] if x else ''),
        'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
        'providerUpdatedDateTime': ('/issued/date-parts', lambda x: datetime_formatter(' '.join([str(part) for part in x[0]]))),
        'uris': {
            'canonicalUri': '/URL'
        },
        'contributors': ('/author', lambda x: [
            process_contributor(*[
                '{} {}'.format(entry.get('given'), entry.get('family')),
                entry.get('ORCID')
            ]) for entry in x
        ]),
        'otherProperties': build_properties(
            ('referenceCount', '/reference-count'),
            ('updatePolicy', '/update-policy'),
            ('depositedTimestamp', '/deposited/timestamp'),
            ('Empty', '/trash/not-here'),
            ('Empty2', '/')
        )
    }
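For reference, CrossRef's issued/date-parts field holds integer arrays, which is why the joins above must stringify each part before handing the result to parse (presumably dateutil's):

from dateutil.parser import parse

date_parts = [[2015, 2, 23]]  # CrossRef: {'issued': {'date-parts': [[Y, M, D]]}}
joined = ' '.join([str(part) for part in date_parts[0]])  # '2015 2 23'
print(parse(joined).date().isoformat())  # '2015-02-23'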
class USGSHarvester(JSONHarvester):
    short_name = 'usgs'
    long_name = 'United States Geological Survey'
    url = 'https://pubs.er.usgs.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    URL = 'https://pubs.er.usgs.gov/pubs-services/publication?'

    schema = {
        'title': '/title',
        'description': '/docAbstract',
        'providerUpdatedDateTime': ('/lastModifiedDate', datetime_formatter),
        'uris': {
            'canonicalUri': ('/id', 'https://pubs.er.usgs.gov/publication/{}'.format),
            'providerUris': [('/id', 'https://pubs.er.usgs.gov/publication/{}'.format)],
            'descriptorUris': [('/doi', 'https://dx.doi.org/{}'.format)]
        },
        'contributors': ('/contributors/authors', process_contributors),
        'otherProperties': build_properties(
            ('serviceID', ('/id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publisher', '/publisher'),
            ('publishedDate', '/displayToPublicDate'),
            ('publicationYear', '/publicationYear'),
            ('issue', '/issue'),
            ('volume', '/volume'),
            ('language', '/language'),
            ('indexId', '/indexId'),
            ('publicationSubtype', '/publicationSubtype'),
            ('startPage', '/startPage'),
            ('endPage', '/endPage'),
            ('onlineOnly', '/onlineOnly'),
            ('additionalOnlineFiles', '/additionalOnlineFiles'),
            ('country', '/country'),
            ('state', '/state'),
            ('ipdsId', '/ipdsId'),
            ('doi', '/doi'),
            ('contributors', '/contributors'),
            ('otherGeospatial', '/otherGeospatial'),
            ('geographicExtents', '/geographicExtents'),
        )
    }

    def harvest(self, start_date=None, end_date=None):
        # This API does not support date ranges, only a mod_x_days window.
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        # days_back = the number of days between start_date and now
        days_back = (date.today() - start_date).days
        search_url = '{0}mod_x_days={1}'.format(self.URL, days_back)

        record_list = []
        for record in self.get_records(search_url):
            doc_id = record['id']
            record_list.append(RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': six.text_type(doc_id),
                'filetype': 'json'
            }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['recordCount']
        logger.info('Harvesting {} records'.format(total_records))

        page_number = 1
        count = 0
        while records.json()['records']:
            record_list = records.json()['records']
            for record in record_list:
                count += 1
                yield record
            page_number += 1
            records = requests.get(search_url + '&page_number={}'.format(page_number), throttle=3)

        logger.info('{} documents harvested'.format(count))
class ClinicalTrialsHarvester(XMLHarvester):
    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors": ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        "providerUpdatedDateTime": ("//lastchanged_date/node()", compose(datetime_formatter, single_result)),
        "title": ('//official_title/node()', '//brief_title/node()', lambda x, y: single_result(x) or single_result(y)),
        "description": ('//brief_summary/textblock/node()', '//brief_summary/textblock/node()', lambda x, y: single_result(x) or single_result(y)),
        "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [
            {
                "sponsor": {
                    "sponsorName": ("//sponsors/lead_sponsor/agency/node()", single_result)
                }
            },
            {
                "sponsor": {
                    "sponsorName": ("//sponsors/collaborator/agency/node()", single_result)
                }
            }
        ],
        "otherProperties": build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            ('eligibility', ('//eligibility/node()', compose(lambda x: list(map(element_to_dict, x)), lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party', '//responsible_party/responsible_party_full_name/node()')
        )
    }

    @property
    def namespaces(self):
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls, then
        get the xml one by one and save it into a list of docs including
        other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.format(
            start_month, start_day, start_year, end_month, end_day, end_year
        )

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if count > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info('Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml',
                }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info("You've requested {} studies, keep going!".format(official_count))
                    count = 0

        return xml_list
class HarvardDataverseHarvester(JSONHarvester):
    short_name = 'harvarddataverse'
    long_name = 'Harvard Dataverse'
    url = 'https://dataverse.harvard.edu'

    namespaces = {}
    MAX_ITEMS_PER_REQUEST = 1000
    URL = 'https://dataverse.harvard.edu/api/search/?q=*'
    TYPE = 'dataset'

    schema = {
        'title': '/name',
        'description': '/description',
        'contributors': ('/authors', default_name_parser),
        'providerUpdatedDateTime': ('/published_at', date_formatter),
        'uris': {
            'canonicalUri': '/url',
            'objectUris': ['/image_url']
        },
        'otherProperties': build_properties(
            ('serviceID', '/global_id'),
            ('type', '/type')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        query = furl.furl(self.URL)
        query.args['type'] = self.TYPE
        query.args['per_page'] = self.MAX_ITEMS_PER_REQUEST
        query.args['key'] = HARVARD_DATAVERSE_API_KEY
        query.args['sort'] = 'date'
        query.args['order'] = 'asc'
        query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

        records = self.get_records(query.url)
        record_list = []
        for record in records:
            doc_id = record['global_id']
            record_list.append(RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id,
                'filetype': 'json'
            }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['data']['total_count']
        start = 0
        all_records = []

        while len(all_records) < total_records:
            records = requests.get(search_url + '&start={}'.format(str(start)))
            record_list = records.json()['data']['items']
            for record in record_list:
                all_records.append(record)
            start += self.MAX_ITEMS_PER_REQUEST

        return all_records
class NSFAwards(JSONHarvester):
    short_name = 'nsfawards'
    long_name = 'NSF Awards'
    url = 'http://www.nsf.gov/'

    URL = 'http://api.nsf.gov/services/v1/awards.json?dateStart='

    schema = {
        'title': '/title',
        'contributors': ('/piFirstName', '/piLastName', '/awardeeName', process_NSF_contributors),
        'providerUpdatedDateTime': ('/date', datetime_formatter),
        'uris': ('/id', process_nsf_uris),
        'sponsorships': ('/agency', '/id', '/title', process_sponsorships),
        'otherProperties': build_properties(
            ('awardeeCity', '/awardeeCity'),
            ('awardeeStateCode', '/awardeeStateCode'),
            ('fundsObligatedAmt', '/fundsObligatedAmt'),
            ('publicAccessMandate', '/publicAccessMandate'),
        )
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date if start_date else date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)

        search_url = '{0}{1}&dateEnd={2}'.format(
            self.URL,
            start_date.strftime('%m/%d/%Y'),
            end_date.strftime('%m/%d/%Y')
        )

        records = self.get_records(search_url)
        record_list = []
        for record in records:
            doc_id = record['id']
            record_list.append(RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': six.text_type(doc_id),
                'filetype': 'json'
            }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url).json()['response'].get('award')
        offset = 1
        all_records = []
        # The API returns pages of 25 awards; a full page means more may follow.
        # Full pages are appended in the loop, and the final partial page is
        # added by the extend() below.
        while len(records) == 25:
            for record in records:
                all_records.append(record)
            offset += 25
            records = requests.get(search_url + '&offset={}'.format(str(offset)), throttle=3).json()['response'].get('award')

        all_records.extend(records)
        return all_records
def formatted_properties(self): return {"otherProperties": build_properties(*list(map(self.format_property, self.property_list)))}
class ELifeHarvester(XMLHarvester):
    short_name = 'elife'
    long_name = 'eLife Sciences'
    url = 'http://elifesciences.org/'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits?'
    BASE_COMMIT_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits/{}'
    BASE_DATA_URL = 'https://raw.githubusercontent.com/elifesciences/elife-article-xml/master/{}'

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
        end_date = end_date or datetime.date.today()

        shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())
        files = list(set(chain.from_iterable([
            fetch_file_names(self.BASE_COMMIT_URL, sha) for sha in shas
        ])))
        files = filter(lambda filename: filename.endswith('.xml'), files)

        xml_records = [fetch_xml(self.BASE_DATA_URL, filename) for filename in files]

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(record),
                'docID': record.xpath('//article-id[@*]')[0].text,
            }) for record in xml_records
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result)),
            'objectUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result))
        },
        'contributors': ('//article-meta/contrib-group/contrib/name/*[not(self::suffix)]/node()', elife_name_parser),
        'providerUpdatedDateTime': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()', compose(datetime_formatter, elife_date_parser)),
        'title': ('//article-meta/title-group/article-title//text()', collapse_list),
        'description': ('//abstract[not(@abstract-type="executive-summary")]/p[1]//text()', collapse_list),
        'publisher': {
            'name': ('//publisher-name/node()', single_result)
        },
        'subjects': '//article-meta/article-categories/descendant::text()',
        'freeToRead': {
            'startDate': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()', elife_date_parser)
        },
        'tags': '//kwd/text()',
        'otherProperties': build_properties(
            ('rights', ('//permissions/license/license-p/ext-link/text()', single_result))
        )
    }
    'description': 'This study seeks to understand how humans impact the dietary patterns of eight free-ranging vervet monkey (Chlorocebus pygerythrus) groups in South Africa using stable isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower()))
    )
})

TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}

TEST_XML_DOC = b'''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
             xmlns:dc="http://purl.org/dc/elements/1.1/"
             xmlns:dcq="http://purl.org/dc/terms/">
    <records count="97" morepages="true" start="1" end="10">
    <record rownumber="1">
class PlosHarvester(XMLHarvester):
    short_name = 'plos'
    long_name = 'Public Library of Science'
    url = 'http://www.plos.org/'

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'http://api.plos.org/search'

    def fetch_rows(self, start_date, end_date):
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

        resp = requests.get(self.BASE_URL, params={
            'q': query,
            'rows': '0',
            'api_key': PLOS_API_KEY,
        })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL, throttle=5, params={
                'q': query,
                'start': current_row,
                'api_key': PLOS_API_KEY,
                'rows': self.MAX_ROWS_PER_REQUEST,
            })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        if not PLOS_API_KEY:
            return []

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(row),
                'docID': row.xpath("str[@name='id']")[0].text,
            })
            for row in self.fetch_rows(start_date.isoformat(), end_date.isoformat())
            if row.xpath("arr[@name='abstract']") or row.xpath("str[@name='author_display']")
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//str[@name="id"]/node()', compose('http://dx.doi.org/{}'.format, single_result)),
        },
        'contributors': ('//arr[@name="author_display"]/str/node()', default_name_parser),
        'providerUpdatedDateTime': ('//date[@name="publication_date"]/node()', compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ('//str[@name="title_display"]/node()', single_result),
        'description': ('//arr[@name="abstract"]/str/node()', single_result),
        'publisher': {
            'name': ('//str[@name="journal"]/node()', single_result)
        },
        'otherProperties': build_properties(
            ('eissn', '//str[@name="eissn"]/node()'),
            ('articleType', '//str[@name="article_type"]/node()'),
            ('score', '//float[@name="score"]/node()')
        )
    }
class FigshareHarvester(JSONHarvester):
    short_name = 'figshare'
    long_name = 'figshare'
    url = 'http://figshare.com/'

    URL = 'http://api.figshare.com/v1/articles/search?search_for=*&from_date='

    schema = {
        'title': '/title',
        'description': '/description',
        'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
        'providerUpdatedDateTime': ('/modified_date', date_formatter),
        'uris': {
            'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
            'providerUris': ['/url']
        },
        'otherProperties': build_properties(
            ('serviceID', ('/article_id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publishedDate', '/published_date')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        """ Figshare should always have a 24 hour delay because they
        manually go through and check for test projects. Most of them
        are removed within 24 hours.

        So, we will shift everything back a day with harvesting to ensure
        nothing is harvested on the day of.
        """
        start_date = start_date - timedelta(1) if start_date else date.today() - timedelta(1 + settings.DAYS_BACK)
        end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)

        search_url = '{0}{1}&to_date={2}'.format(
            self.URL,
            start_date.isoformat(),
            end_date.isoformat()
        )

        records = self.get_records(search_url)
        record_list = []
        for record in records:
            doc_id = record['article_id']
            record_list.append(RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': six.text_type(doc_id),
                'filetype': 'json'
            }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['items_found']
        page = 1
        all_records = []

        while len(all_records) < total_records:
            record_list = records.json()['items']
            for record in record_list:
                if len(all_records) < total_records:
                    all_records.append(record)
            page += 1
            records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)

        return all_records
class DataOneHarvester(XMLHarvester):
    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties': build_properties(
            ('authorGivenName', "str[@name='authorGivenName']/node()"),
            ('authorSurName', "str[@name='authorSurName']/node()"),
            ('authoritativeMN', "str[@name='authoritativeMN']/node()"),
            ('checksum', "str[@name='checksum']/node()"),
            ('checksumAlgorithm', "str[@name='checksumAlgorithm']/node()"),
            ('datasource', "str[@name='datasource']/node()"),
            ('datePublished', "date[@name='datePublished']/node()"),
            ('dateUploaded', "date[@name='dateUploaded']/node()"),
            ('pubDate', "date[@name='pubDate']/node()"),
            ('updateDate', "date[@name='updateDate']/node()"),
            ('fileID', "str[@name='fileID']/node()"),
            ('formatId', "str[@name='formatId']/node()"),
            ('formatType', "str[@name='formatType']/node()"),
            ('identifier', "str[@name='identifier']/node()"),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate', "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed', "bool[@name='replicationAllowed']/node()"),
            ('numberReplicas', "int[@name='numberReplicas']/node()"),
            ('preferredReplicationMN', "arr[@name='preferredReplicationMN']/str/node()"),
            ('rightsHolder', "str[@name='rightsHolder']/node()"),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', "long[@name='size']/node()"),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()"),
            ('sku', "str[@name='sku']/node()")
        ),
        'freeToRead': {
            'startDate': ("bool[@name='isPublic']/node()", "date[@name='dateModified']/node()", lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(datetime_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record, encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info('Not normalizing record with ID {}, type {}'.format(doc_id, format_type))
            else:
                xml_list.append(RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list

    def get_records(self, start_date, end_date):
        ''' Helper function to get a response from the DataONE API,
        with the specified number of rows.
        Returns an etree element with results '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': 0,
            'rows': 1
        })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT, params={
                'q': query,
                'start': n,
                'rows': 1000
            })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000