Example #1
 def schema(self):
     return {
         'contributors': ('/authorNames', process_contributors),
         'uris': {
             'canonicalUri': '/articleFullUrl',
             'objectUris': ('/doi', lambda x: ['http://dx.doi.org/' + x])
         },
         'title': ('/bibliographyTitle', '/blurbTitle', lambda x, y: x or y),
         'providerUpdatedDateTime': ('/published Date', lambda x: parse(x).isoformat()),
         'description': '/blurbText',
         'freeToRead': {
             'startDate': ('/is_free', '/published Date', lambda x, y: y if x else None)
         },
         'otherProperties': build_properties(
             ('imageURL', '/imageUrl', {'description': 'an image url'}),
             ('type', '/type'),
             ('isOpenAccess', '/isOpenAccess'),
             ('articleUrl', '/articleUrl'),
             ('articleFullUrl', '/articleFullUrl'),
             ('isFree', '/isFree'),
             ('isHighlyAccessed', '/isHighlyAccessed'),
             ('status', '/status'),
             ('abstractPath', '/abstractPath'),
             ('journal Id', '/journal Id'),
             ('article_host', '/article_host'),
             ('longCitation', '/longCitation'),
             ('is_subscription', '/is_subscription')
         )
     }
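A note on how these transformer schemas read: a plain string is a JSON-pointer-style path into the record, a nested dict recurses, and a tuple lists one or more paths followed by a callable that receives the resolved values in order. A minimal sketch of that dispatch, with a hypothetical resolve_pointer helper standing in for scrapi's real machinery:

def resolve_pointer(record, pointer):
    # Hypothetical helper: walk a '/a/b'-style path through nested dicts.
    value = record
    for part in pointer.lstrip('/').split('/'):
        value = value.get(part) if isinstance(value, dict) else None
    return value

def resolve_entry(entry, record):
    if isinstance(entry, str):        # bare path
        return resolve_pointer(record, entry)
    if isinstance(entry, dict):       # nested schema
        return {key: resolve_entry(val, record) for key, val in entry.items()}
    *paths, transform = entry         # (path, ..., callable)
    return transform(*(resolve_pointer(record, p) for p in paths))

assert resolve_entry(
    ('/doi', lambda x: ['http://dx.doi.org/' + x]),
    {'doi': '10.1186/example'}
) == ['http://dx.doi.org/10.1186/example']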
Example #2
File: vivo.py Project: zamattiac/scrapi
 def schema(self):
     return {
         'title': ('/title', lambda x: x if x else ''),
         'providerUpdatedDateTime': ('/date', datetime_formatter),
         'uris': {
             'canonicalUri': '/uri',
             'providerUris': ('/uri', lambda x: [x]),
             'objectUris': ('/pmid', '/doi', process_object_uris)
         },
         'contributors': '/authors',
         'subjects': '/subjects',
         'tags': '/keywords',
         'publisher': ('/publisher', lambda x: {'name': x} if x else ''),
         'otherProperties': build_properties(
             ('journalTitle', '/journalTitle'),
             ('abstract', ('/abstract', lambda x: x if x else '')),
             ('type', '/types'),
             ('ISSN', ('/issn', lambda x: x if x else '')),
             ('number', '/number'),
             ('ISBN', '/isbn'),
             ('startPage', '/startPage'),
             ('endPage', '/endPage'),
             ('volume', '/volume'),
         )
     }
Example #3
    def test_arg_kwargs(self):
        def process_title(title, title1="test"):
            return title[0] + (title1[0] if isinstance(title1, list) else title1)

        def process_title2(title1="test"):
            return title1[0] if isinstance(title1, list) else title1

        args = ("//dc:title/node()", )
        kwargs = {"title1": "//dc:title/node()"}

        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {
                'title': (pack(*args, **kwargs), process_title),
                'otherProperties': build_properties(
                    ('title2', (pack(*args), process_title)),
                    ('title3', (pack(**kwargs), process_title2)),
                    ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title))
                )
            }
        )


        results = [self.harvester.normalize(record) for record in self.harvester.harvest(days_back=1)]

        for result in results:
            assert result['title'] == "TestTest"
            assert result['otherProperties'][0]['properties']['title2'] == 'Testtest'
            assert result['otherProperties'][1]['properties']['title3'] == 'Test'
            assert result['otherProperties'][2]['properties']['title4'] == "TestTest"
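pack here appears to bundle positional and keyword path specs so the evaluator can resolve each one and then apply the processor in the same shape. A plausible sketch of such a helper (an assumption, not scrapi's exact definition):

def pack(*args, **kwargs):
    # Capture path specs for deferred application.
    return args, kwargs

args, kwargs = pack('//dc:title/node()', title1='//dc:title/node()')
# After resolving each spec against a record, the evaluator would call
# process_title(*resolved_args, **resolved_kwargs).
assert args == ('//dc:title/node()',)
assert kwargs == {'title1': '//dc:title/node()'}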
Example #4
 def schema(self):
     return {
         'title': ('/title', lambda x: x or ''),
         'description': '/notes',
         'providerUpdatedDateTime': ('/metadata_modified', datetime_formatter),
         'uris': {
             'canonicalUri': ('/name', lambda x: construct_url(self.url, self.dataset_path, x)),  # Construct new urls directing to LWBIN
             'objectUris': ('/url', '/extras', process_object_uris)  # Default urls from the metadata directing to source pages
         },
         'contributors': ('/author', '/author_email', process_contributors),
         'licenses': ('/license_title', '/license_url', '/license_id', process_licenses),
         'tags': ('/tags', lambda x: [tag['name'].lower() for tag in (x or [])]),
         'freeToRead': {
             'startDate': ('/isopen', '/metadata_created', lambda x, y: parse(y).date().isoformat() if x else None)
         },
         'otherProperties': build_properties(
             ('maintainer', '/maintainer'),
             ('maintainerEmail', '/maintainer_email'),
             ('revisionTimestamp', ('/revision_timestamp', datetime_formatter)),
             ('id', '/id'),
             ('metadataCreated', ('/metadata_created', datetime_formatter)),
             ('state', '/state'),
             ('version', '/version'),
             ('creatorUserId', '/creator_user_id'),
             ('type', '/type'),
             ('numberOfResources', '/num_resources'),
             ('numberOfTags', '/num_tags'),
             ('name', '/name'),
             ('groups', '/groups'),
         )
     }
Example #5
 def schema(self):
     return {
         'contributors': (
             '/creators',
             compose(
                 default_name_parser,
                 lambda authors: [author['creator'] for author in authors]
             )
         ),
         'uris': ('/url', process_urls),
         'title': '/title',
         'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
         'description': '/abstract',
         'freeToRead': {
             'startDate': ('/openaccess', '/publicationDate', lambda x, y: y if x == 'true' else None)
         },
         'publisher': {
             'name': '/publisher'
         },
         'subjects': ('/genre', lambda x: [x] if x else []),
         'otherProperties': build_properties(
             ('url', '/url'),
             ('doi', '/doi'),
             ('isbn', '/isbn'),
             ('printIsbn', '/printIsbn'),
             ('electronicIsbn', '/electronicIsbn'),
             ('volume', '/volume'),
             ('number', '/number'),
             ('startingPage', '/startingPage'),
             ('copyright', '/copyright'),
             ('identifier', '/identifier')
         )
     }
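compose chains its callables right to left, so the lambda runs on the raw /creators list first and default_name_parser then sees plain name strings. A minimal single-argument sketch of the idea (scrapi's own compose evidently also forwards multiple arguments to the innermost callable, as later examples show):

from functools import reduce

def compose(*functions):
    # compose(f, g)(x) == f(g(x)): the rightmost function runs first.
    return reduce(lambda f, g: lambda x: f(g(x)), functions)

extract_names = compose(
    lambda names: [name.title() for name in names],             # runs second
    lambda authors: [author['creator'] for author in authors],  # runs first
)
assert extract_names([{'creator': 'ada lovelace'}]) == ['Ada Lovelace']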
Example #6
 def schema(self):
     return {
         'title': ('/title', lambda x: x if x else ''),
         'providerUpdatedDateTime': ('/date', datetime_formatter),
         'uris': {
             'canonicalUri': '/uri',
             'providerUris': ('/uri', lambda x: [x]),
             'objectUris': ('/pmid', '/doi', process_object_uris)
         },
         'contributors': '/authors',
         'subjects': '/subjects',
         'tags': '/keywords',
         'publisher': ('/publisher', lambda x: {'name': x} if x else ''),
         'otherProperties': build_properties(
             ('journalTitle', '/journalTitle'),
             ('abstract', ('/abstract', lambda x: x if x else '')),
             ('type', '/types'),
             ('ISSN', ('/issn', lambda x: x if x else '')),
             ('number', '/number'),
             ('ISBN', '/isbn'),
             ('startPage', '/startPage'),
             ('endPage', '/endPage'),
             ('volume', '/volume'),
         )
     }
Example #7
File: springer.py Project: kms6bn/scrapi
 def schema(self):
     return {
         'contributors': (
             '/creators',
             compose(
                 default_name_parser,
                 lambda authors: [author['creator'] for author in authors]
             )
         ),
         'uris': ('/url', process_urls),
         'title': '/title',
         'providerUpdatedDateTime': ('/publicationDate', datetime_formatter),
         'description': '/abstract',
         'freeToRead': {
             'startDate': ('/openaccess', '/publicationDate', lambda x, y: y if x == 'true' else None)
         },
         'publisher': {
             'name': '/publisher'
         },
         'subjects': ('/genre', lambda x: [x] if x else []),
         'otherProperties': build_properties(
             ('url', '/url'),
             ('doi', '/doi'),
             ('isbn', '/isbn'),
             ('printIsbn', '/printIsbn'),
             ('electronicIsbn', '/electronicIsbn'),
             ('volume', '/volume'),
             ('number', '/number'),
             ('startingPage', '/startingPage'),
             ('copyright', '/copyright'),
             ('identifier', '/identifier')
         )
     }
Example #8
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime': (
             '/issued/date-parts',
             lambda x: parse(' '.join([str(part) for part in x[0]])).date().isoformat().decode('utf-8')
         ),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors': ('/author', lambda x: [
             process_contributor(*[
                 '{} {}'.format(entry.get('given'), entry.get('family')),
                 entry.get('ORCID')
             ]) for entry in x
         ]),
         'otherProperties': build_properties(
             ('journalTitle', '/container-title'),
             ('volume', '/volume'),
             ('tags', ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])),
             ('issue', '/issue'),
             ('publisher', '/publisher'),
             ('type', '/type'),
             ('ISSN', '/ISSN'),
             ('ISBN', '/ISBN'),
             ('member', '/member'),
             ('score', '/score'),
             ('issued', '/issued'),
             ('deposited', '/deposited'),
             ('indexed', '/indexed'),
             ('page', '/page'),
             ('issue', '/issue'),
             ('volume', '/volume'),
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp')
         )
     }
Example #9
 def schema(self):
     return {
         'contributors': ('/authorNames', process_contributors),
         'uris': {
             'canonicalUri': '/articleFullUrl',
             'objectUris': ('/doi', lambda x: ['http://dx.doi.org/' + x])
         },
         'title': ('/bibliographyTitle', '/blurbTitle', lambda x, y: x or y),
         'providerUpdatedDateTime': ('/published Date', lambda x: parse(x).isoformat()),
         'description': '/blurbText',
         'freeToRead': {
             'startDate': ('/is_free', '/published Date', lambda x, y: y if x else None)
         },
         'otherProperties': build_properties(
             ('imageURL', '/imageUrl', {'description': 'an image url'}),
             ('type', '/type'),
             ('isOpenAccess', '/isOpenAccess'),
             ('articleUrl', '/articleUrl'),
             ('articleFullUrl', '/articleFullUrl'),
             ('isFree', '/isFree'),
             ('isHighlyAccessed', '/isHighlyAccessed'),
             ('status', '/status'),
             ('abstractPath', '/abstractPath'),
             ('journal Id', '/journal Id'),
             ('article_host', '/article_host'),
             ('longCitation', '/longCitation'),
             ('is_subscription', '/is_subscription')
         )
     }
Example #10
 def formatted_properties(self):
     return {
         'otherProperties': build_properties(*[(item, (
             '//dc:{}/node()'.format(item),
             '//ns0:{}/node()'.format(item),
             self.resolve_property)
         ) for item in self.property_list])
     }
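The starred comprehension above simply generates one (name, (dc-xpath, ns0-xpath, resolver)) entry per property name. Expanded by hand for a two-item property_list, with a stand-in resolver:

def resolve_property(dc_nodes, ns0_nodes):
    # Stand-in for self.resolve_property: prefer the dc result.
    return dc_nodes or ns0_nodes

property_list = ['type', 'format']
entries = [(item, ('//dc:{}/node()'.format(item),
                   '//ns0:{}/node()'.format(item),
                   resolve_property))
           for item in property_list]
assert entries[0] == ('type', ('//dc:type/node()', '//ns0:type/node()', resolve_property))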
Example #11
 def formatted_properties(self):
     return {
         'otherProperties': build_properties(*[(item, (
             '//dc:{}/node()'.format(item),
             '//ns0:{}/node()'.format(item),
             self.resolve_property)
         ) for item in self.property_list])
     }
Example #12
File: nih.py Project: zamattiac/scrapi
 def schema(self):
     return {
         "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
         "uris": {
             "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)),
             "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                                self.construct_descriptor_uris)
         },
         "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)),
         "title": ('//PROJECT_TITLE/node()', single_result),
         "tags": '//PROJECT_TERMSX/TERM/node()',
         "otherProperties": build_properties(
             ("applicationID", "//APPLICATION_ID/node()"),
             ('activity', '//ACTIVITY/node()'),
             ('administeringIC', '//ADMINISTERING_IC/node()'),
             ('arraFunded', '//ARRA_FUNDED/node()'),
             ('budgetStart', '//BUDGET_START/node()'),
             ('budgetEnd', '//BUDGET_END/node()'),
             ('FOANumber', '//FOA_NUMBER/node()'),
             ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
             ('fundingICs', '//FUNDING_ICs/node()'),
             ('fiscalYear', '//FY/node()'),
             ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
             ('organizationCity', '//ORG_CITY/node()'),
             ('organizationCountry', '//ORG_CONTRY/node()'),
             ('organizationDistrict', '//ORG_DISTRICT/node()'),
             ('organizationDUNS', '//ORG_DUNS/node()'),
             ('organizationDept', '//ORG_DEPT/node()'),
             ('organizationFIPS', '//ORG_FIPS/node()'),
             ('organizationState', '//ORG_STATE/node()'),
             ('organizationZipcode', '//ORG_ZIPCODE/node()'),
             ('ICName', '//IC_NAME/node()'),
             ('organizationName', '//ORG_NAME/node()'),
             ('projectStart', '//PROJECT_START/node()'),
             ('projectEnd', '//PROJECT_END/node()'),
             ('PHR', '//PHR/node()'),
             ('serialNumber', '//SERIAL_NUMBER/node()'),
             ('studySection', '//STUDY_SECTION/node()'),
             ('studySectionName', '//STUDY_SECTION_NAME/node()'),
             ('supportYear', '//SUPPORT_YEAR/node()'),
             ('suffix', '//SUFFIX/node()'),
             ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
             ('totalCost', '//TOTAL_COST/node()'),
             ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
             ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
             ('CFDACode', '//CFDA_CODE/node()'),
             ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
             ('edInstType', '//ED_INST_TYPE/node()'),
             ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
             ('fundingMechanism', '//FUNDING_MECHANISM/node()')
         )
     }
Example #13
    def schema(self):
        properties = {
            'otherProperties': build_properties(*[(item, (
                '//dc:{}/node()'.format(item),
                '//ns0:{}/node()'.format(item),
                self.resolve_property)
            ) for item in self.property_list])
        }

        return updated_schema(OAISCHEMA, properties)
Example #14
    def schema(self):
        properties = {
            'otherProperties': build_properties(*[(item, (
                '//dc:{}/node()'.format(item),
                '//ns0:{}/node()'.format(item),
                self.resolve_property)
            ) for item in self.property_list])
        }

        return updated_schema(OAISCHEMA, properties)
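updated_schema appears to overlay the generated otherProperties onto the base OAISCHEMA rather than mutate it. A recursive merge consistent with that usage, offered as a sketch rather than the library's actual implementation:

def updated_schema(base, overrides):
    # Return a new dict: override leaves win, nested dicts merge recursively.
    merged = dict(base)
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = updated_schema(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {'uris': {'canonicalUri': '/uri'}, 'title': '/title'}
assert updated_schema(base, {'uris': {'objectUris': '/doi'}}) == {
    'uris': {'canonicalUri': '/uri', 'objectUris': '/doi'},
    'title': '/title',
}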
Example #15
 def schema(self):
     return {
         "contributors": ('//PIS/PI/PI_NAME/node()', '//ORG_NAME', nih_name_parser),
         "uris": {
             "canonicalUri": ("//APPLICATION_ID/node()", compose(self.construct_project_url, single_result)),
             "descriptorUris": ("//APPLICATION_ID/node()", "//FOA_NUMBER/node()",
                                self.construct_descriptor_uris)
         },
         "providerUpdatedDateTime": ("AWARD_NOTICE_DATE/node()", compose(datetime_formatter, single_result)),
         "title": ('//PROJECT_TITLE/node()', single_result),
         "tags": ('//PROJECT_TERMSX/TERM/node()'),
         "otherProperties": build_properties(
             ("applicationID", "//APPLICATION_ID/node()"),
             ('activity', '//ACTIVITY/node()'),
             ('administeringIC', '//ADMINISTERING_IC/node()'),
             ('arraFunded', '//ARRA_FUNDED/node()'),
             ('budgetStart', '//BUDGET_START/node()'),
             ('budgetEnd', '//BUDGET_END/node()'),
             ('FOANumber', '//FOA_NUMBER/node()'),
             ('fullProjectNumber', '//FULL_PROJECT_NUM/node()'),
             ('fundingICs', '//FUNDING_ICs/node()'),
             ('fiscalYear', '//FY/node()'),
             ('NIHSpendingCats', '//NIH_SPENDING_CATS/@xsi:nil'),
             ('organizationCity', '//ORG_CITY/node()'),
             ('organizationCountry', '//ORG_CONTRY/node()'),
             ('organizationDistrict', '//ORG_DISTRICT/node()'),
             ('organizationDUNS', '//ORG_DUNS/node()'),
             ('organizationDept', '//ORG_DEPT/node()'),
             ('organizationFIPS', '//ORG_FIPS/node()'),
             ('organizationState', '//ORG_STATE/node()'),
             ('organizationZipcode', '//ORG_ZIPCODE/node()'),
             ('ICName', '//IC_NAME/node()'),
             ('organizationName', '//ORG_NAME/node()'),
             ('projectStart', '//PROJECT_START/node()'),
             ('projectEnd', '//PROJECT_END/node()'),
             ('PHR', '//PHR/node()'),
             ('serialNumber', '//SERIAL_NUMBER/node()'),
             ('studySection', '//STUDY_SECTION/node()'),
             ('studySectionName', '//STUDY_SECTION_NAME/node()'),
             ('supportYear', '//SUPPORT_YEAR/node()'),
             ('suffix', '//SUFFIX/node()'),
             ('subProjectID', '//SUBPROJECT_ID/@xsi:nil'),
             ('totalCost', '//TOTAL_COST/node()'),
             ('totalCostSubProject', '//TOTAL_COST_SUB_PROJECT/node()'),
             ('coreProjectNumber', '//CORE_PROJECT_NUM/node()'),
             ('CFDACode', '//CFDA_CODE/node()'),
             ('programOfficerName', '//PROGRAM_OFFICER_NAME/node()'),
             ('edInstType', '//ED_INST_TYPE/node()'),
             ('awardNoticeDate', '//AWARD_NOTICE_DATE/node()'),
             ('fundingMechanism', '//FUNDING_MECHANISM/node()')
         )
     }
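single_result recurs throughout these XML schemas because XPath queries return node lists even when a single value is expected; its contract looks like "first element, or an empty default". A sketch under that assumption:

def single_result(results, default=''):
    # Collapse an XPath node list to its first entry.
    return results[0] if results else default

assert single_result(['Study of X']) == 'Study of X'
assert single_result([]) == ''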
Example #16
 def schema(self):
     return {
         'contributors': ('/authors', process_contributors),
         'uris': {
             'objectUris': ('/url', '/full_dataset_url', compose(filter_none, lambda x, y: [x, y])),
             'descriptorUris': ('/DOI', '/paper_url', compose(filter_none, lambda x, y: [('http://dx.doi.org/{}'.format(x) if x else None), y])),
             'canonicalUri': '/url',
         },
         'title': '/name',
         'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
         'description': '/description',
         'otherProperties': build_properties(
             ('owner_name', '/owner_name'),
         )
     }
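filter_none keeps a missing DOI from leaving a None hole in the uri lists: composed after the two-argument lambda, it prunes the assembled list. A sketch of the helper together with the descriptorUris pipeline above:

def filter_none(items):
    # Drop absent values so uri lists contain only real entries.
    return [item for item in items if item is not None]

make_descriptor_uris = lambda doi, paper_url: [
    ('http://dx.doi.org/{}'.format(doi) if doi else None),
    paper_url,
]
assert filter_none(make_descriptor_uris(None, 'http://example.org/paper')) == \
    ['http://example.org/paper']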
Example #17
 def schema(self):
     return {
         'contributors': ('/contributors', process_contributors),
         'title': ('/title', process_null),
         'providerUpdatedDateTime': ('/date_created', parse_date),
         'description': ('/description', process_null),
         'uris': {
             'canonicalUri': ('/url', lambda x: 'http://osf.io' + x),
         },
         'tags': ('/tags', process_tags),
         'otherProperties': build_properties(
             ('parent_title', '/parent_title'),
             ('category', '/category'),
             ('wiki_link', '/wiki_link'),
             ('is_component', '/is_component'),
             ('is_registration', '/is_registration'),
             ('parent_url', '/parent_url'),
             ('journal Id', '/journal Id')
         )
     }
Example #18
 def schema(self):
     return {
         "title": ("/title", lambda x: x[0] if x else ""),
         "description": ("/subtitle", lambda x: x[0] if (isinstance(x, list) and x) else x or ""),
         "providerUpdatedDateTime": (
             "/issued/date-parts",
             lambda x: parse(" ".join([str(part) for part in x[0]])).date().isoformat(),
         ),
         "uris": {"canonicalUri": "/URL"},
         "contributors": (
             "/author",
             compose(
                 lambda x: [
                     process_contributor(
                         *["{} {}".format(entry.get("given"), entry.get("family")), entry.get("ORCID")]
                     )
                     for entry in x
                 ],
                 lambda x: x or [],
             ),
         ),
         "sponsorships": ("/funder", lambda x: process_sponsorships(x) if x else []),
         "otherProperties": build_properties(
             ("journalTitle", "/container-title"),
             ("volume", "/volume"),
             ("tags", ("/subject", "/container-title", lambda x, y: [tag.lower() for tag in (x or []) + (y or [])])),
             ("issue", "/issue"),
             ("publisher", "/publisher"),
             ("type", "/type"),
             ("ISSN", "/ISSN"),
             ("ISBN", "/ISBN"),
             ("member", "/member"),
             ("score", "/score"),
             ("issued", "/issued"),
             ("deposited", "/deposited"),
             ("indexed", "/indexed"),
             ("page", "/page"),
             ("issue", "/issue"),
             ("volume", "/volume"),
             ("referenceCount", "/reference-count"),
             ("updatePolicy", "/update-policy"),
             ("depositedTimestamp", "/deposited/timestamp"),
         ),
     }
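CrossRef's issued/date-parts field is a nested list such as [[2015, 2, 23]], which is why the lambda joins the stringified parts of x[0] before parsing. Worked through by hand:

from dateutil.parser import parse

date_parts = [[2015, 2, 23]]
joined = ' '.join(str(part) for part in date_parts[0])  # '2015 2 23'
assert parse(joined).date().isoformat() == '2015-02-23'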
Example #19
File: lwbin.py Project: zamattiac/scrapi
 def schema(self):
     return {
         'title': ('/title', lambda x: x or ''),
         'description': '/notes',
         'providerUpdatedDateTime': ('/metadata_modified', datetime_formatter),
         'uris': {
             'canonicalUri': ('/name', lambda x: construct_url(self.url, self.dataset_path, x)),  # Construct new urls directing to LWBIN
             'objectUris': ('/url', '/extras', process_object_uris)  # Default urls from the metadata directing to source pages
         },
         'contributors': ('/author', '/author_email', process_contributors),
         'licenses': ('/license_title', '/license_url', '/license_id', process_licenses),
         'tags': ('/tags', lambda x: [tag['name'].lower() for tag in (x or [])]),
         'freeToRead': {
             'startDate': ('/isopen', '/metadata_created', lambda x, y: parse(y).date().isoformat() if x else None)
         },
         'otherProperties': build_properties(
             ('maintainer', '/maintainer'),
             ('maintainerEmail', '/maintainer_email'),
             ('revisionTimestamp', ('/revision_timestamp', datetime_formatter)),
             ('id', '/id'),
             ('metadataCreated', ('/metadata_created', datetime_formatter)),
             ('state', '/state'),
             ('version', '/version'),
             ('creatorUserId', '/creator_user_id'),
             ('type', '/type'),
             ('numberOfResources', '/num_resources'),
             ('numberOfTags', '/num_tags'),
             ('name', '/name'),
             ('groups', '/groups'),
         )
     }
Example #20
File: osf.py Project: NeuroVault/scrapi
 def schema(self):
     return {
         'contributors': ('/contributors', process_contributors),
         'title': ('/title', process_null),
         'providerUpdatedDateTime': ('/date_registered', date_formatter),
         'description': ('/description', process_null),
         'uris': {
             'canonicalUri': ('/url', lambda x: 'http://osf.io' + x),
         },
         'tags': ('/tags', process_tags),
         'otherProperties': build_properties(
             ('parent_title', '/parent_title'),
             ('category', '/category'),
             ('wiki_link', '/wiki_link'),
             ('is_component', '/is_component'),
             ('is_registration', '/is_registration'),
             ('parent_url', '/parent_url'),
             ('journal Id', '/journal Id')
         )
     }
Example #21
File: osf.py Project: zamattiac/scrapi
 def schema(self):
     return {
         'contributors': ('/contributors', process_contributors),
         'title': ('/title', lambda x: x or ''),
         'providerUpdatedDateTime': ('/date_registered', datetime_formatter),
         'description': '/description',
         'uris': {
             'canonicalUri': ('/url', url_from_guid),
             'providerUris': ('/url', compose(coerce_to_list, url_from_guid))
         },
         'tags': '/tags',
         'otherProperties': build_properties(
             ('parent_title', '/parent_title'),
             ('category', '/category'),
             ('wiki_link', '/wiki_link'),
             ('is_component', '/is_component'),
             ('is_registration', '/is_registration'),
             ('parent_url', '/parent_url'),
             ('journal Id', '/journal Id')
         )
     }
Example #22
 def schema(self):
     return {
         'contributors': ('/authors', process_contributors),
         'uris': {
             'objectUris': ('/url', '/full_dataset_url', compose(filter_none, lambda x, y: [x, y])),
             'descriptorUris': ('/DOI', '/paper_url', compose(filter_none, lambda x, y: [('http://dx.doi.org/{}'.format(x) if x else None), y])),
             'canonicalUri': '/url',
         },
         'title': '/name',
         'providerUpdatedDateTime': ('/modify_date', datetime_formatter),
         'description': '/description',
         'otherProperties': build_properties(
             ('owner_name', '/owner_name'),
         )
     }
Example #23
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime': ('/issued/date-parts', lambda x: parse(' '.join([str(part) for part in x[0]])).isoformat().decode('utf-8')),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors': ('/author', lambda x: [
             process_contributor(*[
                 '{} {}'.format(entry.get('given'), entry.get('family')),
                 entry.get('ORCID')
             ]) for entry in x
         ]),
         'otherProperties': build_properties(
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp'),
             ('Empty', '/trash/not-here'),
             ('Empty2', '/')
         )
     }
Example #24
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime': ('/issued/date-parts',
                                     compose(datetime_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors': ('/author', compose(lambda x: [
             process_contributor(*[
                 '{} {}'.format(entry.get('given'), entry.get('family')),
                 entry.get('ORCID')
             ]) for entry in x
         ], lambda x: x or [])),
         'sponsorships': ('/funder', lambda x: process_sponsorships(x) if x else []),
         'tags': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'subjects': ('/subject', '/container-title', lambda x, y: [tag.lower() for tag in (x or []) + (y or [])]),
         'otherProperties': build_properties(
             ('journalTitle', '/container-title'),
             ('volume', '/volume'),
             ('issue', '/issue'),
             ('publisher', '/publisher'),
             ('type', '/type'),
             ('ISSN', '/ISSN'),
             ('ISBN', '/ISBN'),
             ('member', '/member'),
             ('score', '/score'),
             ('issued', '/issued'),
             ('deposited', '/deposited'),
             ('indexed', '/indexed'),
             ('page', '/page'),
             ('issue', '/issue'),
             ('volume', '/volume'),
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp')
         )
     }
Example #25
 def schema(self):
     return {
         'title': ('/title', lambda x: x[0] if x else ''),
         'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
         'providerUpdatedDateTime': ('/issued/date-parts',
                                     lambda x: datetime_formatter(' '.join([str(part) for part in x[0]]))),
         'uris': {
             'canonicalUri': '/URL'
         },
         'contributors': ('/author', lambda x: [
             process_contributor(*[
                 '{} {}'.format(entry.get('given'), entry.get('family')),
                 entry.get('ORCID')
             ]) for entry in x
         ]),
         'otherProperties': build_properties(
             ('referenceCount', '/reference-count'),
             ('updatePolicy', '/update-policy'),
             ('depositedTimestamp', '/deposited/timestamp'),
             ('Empty', '/trash/not-here'),
             ('Empty2', '/')
         )
     }
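datetime_formatter is the workhorse of these schemas; judging by how it is fed everything from ISO dates to joined date parts, its contract seems to be "any parseable date string in, ISO 8601 out". A plausible sketch, not the project's verbatim helper:

from dateutil.parser import parse

def datetime_formatter(datestring):
    # Normalize any parseable date string to ISO 8601.
    return parse(datestring).isoformat()

assert datetime_formatter('2015 2 23') == '2015-02-23T00:00:00'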
Example #26
    def test_arg_kwargs(self):
        def process_title(title, title1="test"):
            return title[0] + (title1[0] if isinstance(title1, list) else title1)

        def process_title2(title1="test"):
            return title1[0] if isinstance(title1, list) else title1

        args = ("//dc:title/node()", )
        kwargs = {"title1": "//dc:title/node()"}

        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {
                'title': (pack(*args, **kwargs), process_title),
                'otherProperties': build_properties(
                    ('title2', (pack(*args), process_title)),
                    ('title3', (pack(**kwargs), process_title2)),
                    ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title))
                )
            }
        )

        results = [self.harvester.normalize(record) for record in self.harvester.harvest(days_back=1)]

        for result in results:
            assert result['title'] == "TestTest"
            assert result['otherProperties'][0]['properties']['title2'] == 'Testtest'
            assert result['otherProperties'][1]['properties']['title3'] == 'Test'
            assert result['otherProperties'][2]['properties']['title4'] == "TestTest"
Example #27
class USGSHarvester(JSONHarvester):
    short_name = 'usgs'
    long_name = 'United States Geological Survey'
    url = 'https://pubs.er.usgs.gov/'
    DEFAULT_ENCODING = 'UTF-8'

    URL = 'https://pubs.er.usgs.gov/pubs-services/publication?'

    schema = {
        'title': '/title',
        'description': '/docAbstract',
        'providerUpdatedDateTime': ('/lastModifiedDate', datetime_formatter),
        'uris': {
            'canonicalUri': ('/id', 'https://pubs.er.usgs.gov/publication/{}'.format),
            'providerUris': [('/id', 'https://pubs.er.usgs.gov/publication/{}'.format)],
            'descriptorUris': [('/doi', 'https://dx.doi.org/{}'.format)]
        },
        'contributors': ('/contributors/authors', process_contributors),
        'otherProperties': build_properties(
            ('serviceID', ('/id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publisher', '/publisher'),
            ('publishedDate', '/displayToPublicDate'),
            ('publicationYear', '/publicationYear'),
            ('issue', '/issue'),
            ('volume', '/volume'),
            ('language', '/language'),
            ('indexId', '/indexId'),
            ('publicationSubtype', '/publicationSubtype'),
            ('startPage', '/startPage'),
            ('endPage', '/endPage'),
            ('onlineOnly', '/onlineOnly'),
            ('additionalOnlineFiles', '/additionalOnlineFiles'),
            ('country', '/country'),
            ('state', '/state'),
            ('ipdsId', '/ipdsId'),
            ('links', '/links'),
            ('doi', '/doi'),
            ('contributors', '/contributors'),
            ('otherGeospatial', '/otherGeospatial'),
            ('geographicExtents', '/geographicExtents'),
        )
    }

    def harvest(self, start_date=None, end_date=None):

        # This API does not support date ranges
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)

        # days_back = the number of days between start_date and now, defaulting to settings.DAYS_BACK
        days_back = settings.DAYS_BACK
        search_url = '{0}mod_x_days={1}'.format(self.URL, days_back)

        record_list = []
        for record in self.get_records(search_url):
            doc_id = record['id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': six.text_type(doc_id),
                    'filetype': 'json'
                }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['recordCount']
        logger.info('Harvesting {} records'.format(total_records))
        page_number = 1
        count = 0

        while records.json()['records']:
            record_list = records.json()['records']
            for record in record_list:
                count += 1
                yield record

            page_number += 1
            records = requests.get(search_url +
                                   '&page_number={}'.format(page_number),
                                   throttle=3)
            logger.info('{} documents harvested'.format(count))
Example #28
class ClinicalTrialsHarvester(XMLHarvester):

    short_name = 'clinicaltrials'
    long_name = 'ClinicalTrials.gov'
    url = 'https://clinicaltrials.gov/'

    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    # TODO - clinicaltrials elements have a lot of extra metadata - at some
    # point in the future we should do a more thorough audit.
    schema = {
        "contributors": ('//overall_official/last_name/node()', default_name_parser),
        "uris": {
            "canonicalUri": ("//required_header/url/node()", single_result)
        },
        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(datetime_formatter, single_result)),
        "title": ('//official_title/node()', '//brief_title/node()',
                  lambda x, y: single_result(x) or single_result(y)),
        "description": ('//brief_summary/textblock/node()',
                        '//brief_summary/textblock/node()',
                        lambda x, y: single_result(x) or single_result(y)),
        "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
        "sponsorships": [{
            "sponsor": {
                "sponsorName": ("//sponsors/lead_sponsor/agency/node()", single_result)
            }
        }, {
            "sponsor": {
                "sponsorName": ("//sponsors/collaborator/agency/node()", single_result)
            }
        }],
        "otherProperties": build_properties(
            ("serviceID", "//nct_id/node()"),
            ('oversightAuthority', '//oversight_info/authority/node()'),
            ('studyDesign', '//study_design/node()'),
            ('numberOfArms', '//number_of_arms/node()'),
            ('source', '//source/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('condition', '//condition/node()'),
            ('verificationDate', '//verification_date/node()'),
            ('lastChanged', '//lastchanged_date/node()'),
            ('status', '//status/node()'),
            ('locationCountries', '//location_countries/country/node()'),
            ('isFDARegulated', '//is_fda_regulated/node()'),
            ('isSection801', '//is_section_801/node()'),
            ('hasExpandedAccess', '//has_expanded_access/node()'),
            ('leadSponsorAgencyClass', '//lead_sponsor/agency_class/node()'),
            ('collaborator', '//collaborator/agency/node()'),
            ('collaboratorAgencyClass', '//collaborator/agency_class/node()'),
            ('measure', '//primary_outcome/measure/node()'),
            ('timeFrame', '//primary_outcome/time_frame/node()'),
            ('safetyIssue', '//primary_outcome/safety_issue/node()'),
            ('secondaryOutcomes', '//secondary_outcome/measure/node()'),
            ('enrollment', '//enrollment/node()'),
            ('armGroup', '//arm_group/arm_group_label/node()'),
            ('intervention', '//intervention/intervention_type/node()'),
            ('eligibility', ('//eligibility/node()',
                             compose(lambda x: list(map(element_to_dict, x)),
                                     lambda x: list(filter(non_string, x))))),
            ('link', '//link/url/node()'),
            ('responsible_party', '//responsible_party/responsible_party_full_name/node()')
        )
    }

    @property
    def namespaces(self):
        return None

    def harvest(self, start_date=None, end_date=None):
        """ First, get a list of all recently updated study urls,
        then get the xml one by one and save it into a list
        of docs including other information """

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        end_month = end_date.strftime('%m')
        end_day = end_date.strftime('%d')
        end_year = end_date.strftime('%Y')

        start_month = start_date.strftime('%m')
        start_day = start_date.strftime('%d')
        start_year = start_date.strftime('%Y')

        base_url = 'http://clinicaltrials.gov/ct2/results?lup_s='
        url_end = '{}%2F{}%2F{}&lup_e={}%2F{}%2F{}&displayxml=true'.\
            format(start_month, start_day, start_year, end_month, end_day, end_year)

        url = base_url + url_end

        # grab the total number of studies
        initial_request = requests.get(url)
        record_encoding = initial_request.encoding
        initial_request_xml = etree.XML(initial_request.content)
        count = int(initial_request_xml.xpath('//search_results/@count')[0])
        xml_list = []
        if int(count) > 0:
            # get a new url with all results in it
            url = url + '&count=' + str(count)
            total_requests = requests.get(url)
            initial_doc = etree.XML(total_requests.content)

            # make a list of urls from that full list of studies
            study_urls = []
            for study in initial_doc.xpath('//clinical_study'):
                study_urls.append(
                    study.xpath('url/node()')[0] + '?displayxml=true')

            # grab each of those urls for full content
            logger.info("There are {} urls to harvest - be patient...".format(
                len(study_urls)))
            count = 0
            official_count = 0
            for study_url in study_urls:
                try:
                    content = requests.get(study_url)
                except requests.exceptions.ConnectionError as e:
                    logger.info(
                        'Connection error: {}, wait a bit...'.format(e))
                    time.sleep(30)
                    continue
                doc = etree.XML(content.content)
                record = etree.tostring(doc, encoding=record_encoding)
                doc_id = doc.xpath('//nct_id/node()')[0]
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml',
                    }))
                official_count += 1
                count += 1
                if count % 100 == 0:
                    logger.info(
                        "You've requested {} studies, keep going!".format(
                            official_count))
                    count = 0

        return xml_list
Example #29
class HarvardDataverseHarvester(JSONHarvester):
    short_name = 'harvarddataverse'
    long_name = 'Harvard Dataverse'
    url = 'https://dataverse.harvard.edu'

    namespaces = {}

    MAX_ITEMS_PER_REQUEST = 1000
    URL = 'https://dataverse.harvard.edu/api/search/?q=*'
    TYPE = 'dataset'

    schema = {
        'title': '/name',
        'description': '/description',
        'contributors': ('/authors', default_name_parser),
        'providerUpdatedDateTime': ('/published_at', date_formatter),
        'uris': {
            'canonicalUri': '/url',
            'objectUris': ['/image_url']
        },
        'otherProperties': build_properties(
            ('serviceID', '/global_id'),
            ('type', '/type')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        query = furl.furl(self.URL)
        query.args['type'] = self.TYPE
        query.args['per_page'] = self.MAX_ITEMS_PER_REQUEST
        query.args['key'] = HARVARD_DATAVERSE_API_KEY
        query.args['sort'] = 'date'
        query.args['order'] = 'asc'
        query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date, end_date)

        records = self.get_records(query.url)
        record_list = []
        for record in records:
            doc_id = record['global_id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': doc_id,
                    'filetype': 'json'
                }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['data']['total_count']
        start = 0
        all_records = []

        while len(all_records) < total_records:
            records = requests.get(search_url + '&start={}'.format(str(start)))
            record_list = records.json()['data']['items']

            for record in record_list:
                all_records.append(record)

            start += self.MAX_ITEMS_PER_REQUEST

        return all_records
Example #30
class NSFAwards(JSONHarvester):
    short_name = 'nsfawards'
    long_name = 'NSF Awards'
    url = 'http://www.nsf.gov/'

    URL = 'http://api.nsf.gov/services/v1/awards.json?dateStart='

    schema = {
        'title': '/title',
        'contributors': ('/piFirstName', '/piLastName', '/awardeeName',
                         process_NSF_contributors),
        'providerUpdatedDateTime': ('/date', datetime_formatter),
        'uris': ('/id', process_nsf_uris),
        'sponsorships': ('/agency', '/id', '/title', process_sponsorships),
        'otherProperties': build_properties(
            ('awardeeCity', '/awardeeCity'),
            ('awardeeStateCode', '/awardeeStateCode'),
            ('fundsObligatedAmt', '/fundsObligatedAmt'),
            ('publicAccessMandate', '/publicAccessMandate'),
        )
    }

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date if start_date else date.today() - timedelta(
            settings.DAYS_BACK)
        end_date = end_date - timedelta(
            1) if end_date else date.today() - timedelta(1)

        search_url = '{0}{1}&dateEnd={2}'.format(
            self.URL, start_date.strftime('%m/%d/%Y'),
            end_date.strftime('%m/%d/%Y'))

        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': six.text_type(doc_id),
                    'filetype': 'json'
                }))

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url).json()['response'].get('award')
        offset = 1

        all_records = []
        while len(records) == 25:
            for record in records:
                all_records.append(record)

            offset += 25
            records = requests.get(search_url +
                                   '&offset={}'.format(str(offset)),
                                   throttle=3).json()['response'].get('award')
        all_records.extend(records)

        return all_records
Example #31
File: __init__.py Project: kms6bn/scrapi
 def formatted_properties(self):
     return {"otherProperties": build_properties(*list(map(self.format_property, self.property_list)))}
Example #32
class ELifeHarvester(XMLHarvester):
    short_name = 'elife'
    long_name = 'eLife Sciences'
    url = 'http://elifesciences.org/'
    DEFAULT_ENCODING = 'UTF-8'
    record_encoding = None

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits?'
    BASE_COMMIT_URL = 'https://api.github.com/repos/elifesciences/elife-article-xml/commits/{}'
    BASE_DATA_URL = 'https://raw.githubusercontent.com/elifesciences/elife-article-xml/master/{}'

    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
        end_date = end_date or datetime.date.today()

        shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())

        files = list(set(chain.from_iterable([
            fetch_file_names(self.BASE_COMMIT_URL, sha)
            for sha in shas])))

        files = filter(lambda filename: filename.endswith('.xml'), files)

        xml_records = [
            fetch_xml(self.BASE_DATA_URL, filename)
            for filename in files
        ]

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(record),
                'docID': record.xpath('//article-id[@*]')[0].text,
            }) for record in xml_records
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format,
                                                            single_result)),
            'objectUri': ('//article-id/node()', compose('http://dx.doi.org/10.7554/eLife.{}'.format, single_result))
        },
        'contributors': ('//article-meta/contrib-group/contrib/name/*[not(self::suffix)]/node()', elife_name_parser),
        'providerUpdatedDateTime': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()',
                                    compose(datetime_formatter, elife_date_parser)),
        'title': ('//article-meta/title-group/article-title//text()', collapse_list),
        'description': ('//abstract[not(@abstract-type="executive-summary")]/p[1]//text()', collapse_list),
        'publisher': {
            'name': ('//publisher-name/node()', single_result)
        },
        'subjects': '//article-meta/article-categories/descendant::text()',
        'freeToRead': {
            'startDate': ('//article-meta/pub-date[@publication-format="electronic"]/*/node()',
                          elife_date_parser)
        },
        'tags': '//kwd/text()',
        'otherProperties': build_properties(
                ('rights', ('//permissions/license/license-p/ext-link/text()', single_result))
        )
    }
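A detail worth noticing in the eLife schema above: a bound str.format works as a pipeline stage, so composing it with single_result turns a one-element article-id node list into a DOI URL. In isolation:

make_uri = 'http://dx.doi.org/10.7554/eLife.{}'.format
assert make_uri('00123') == 'http://dx.doi.org/10.7554/eLife.00123'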
Example #33
File: utils.py Project: erinspace/scrapi
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}


TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower()))
    )
})
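Note the arity rule these test schemas exercise: each path in an entry supplies one positional argument to the trailing callable, so the two-path title3 entry pairs with a two-argument lambda. In miniature:

combine = lambda x, y: (x[0] if x else '') + (y[0] if y else '').lower()
assert combine(['Test'], ['Test']) == 'Testtest'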


TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}


TEST_XML_DOC = b'''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
Example #34
class PlosHarvester(XMLHarvester):
    short_name = 'plos'
    long_name = 'Public Library of Science'
    url = 'http://www.plos.org/'

    namespaces = {}

    MAX_ROWS_PER_REQUEST = 999
    BASE_URL = 'http://api.plos.org/search'

    def fetch_rows(self, start_date, end_date):
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date, end_date)

        resp = requests.get(self.BASE_URL,
                            params={
                                'q': query,
                                'rows': '0',
                                'api_key': PLOS_API_KEY,
                            })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL,
                                    throttle=5,
                                    params={
                                        'q': query,
                                        'start': current_row,
                                        'api_key': PLOS_API_KEY,
                                        'rows': self.MAX_ROWS_PER_REQUEST,
                                    })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST

    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        if not PLOS_API_KEY:
            return []

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(row),
                'docID': row.xpath("str[@name='id']")[0].text,
            }) for row in self.fetch_rows(start_date.isoformat(),
                                          end_date.isoformat())
            if row.xpath("arr[@name='abstract']")
            or row.xpath("str[@name='author_display']")
        ]

    schema = {
        'uris': {
            'canonicalUri': ('//str[@name="id"]/node()',
                             compose('http://dx.doi.org/{}'.format, single_result)),
        },
        'contributors': ('//arr[@name="author_display"]/str/node()', default_name_parser),
        'providerUpdatedDateTime': ('//date[@name="publication_data"]/node()',
                                    compose(lambda x: parse(x).date().isoformat(), single_result)),
        'title': ('//str[@name="title_display"]/node()', single_result),
        'description': ('//arr[@name="abstract"]/str/node()', single_result),
        'publisher': {
            'name': ('//str[@name="journal"]/node()', single_result)
        },
        'otherProperties': build_properties(
            ('eissn', '//str[@name="eissn"]/node()'),
            ('articleType', '//str[@name="article_type"]/node()'),
            ('score', '//float[@name="score"]/node()')
        )
    }
Example #35
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower()))
    )
})

TEST_NAMESPACES = {
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'dcq': 'http://purl.org/dc/terms/'
}

TEST_XML_DOC = '''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">
            <record rownumber="1">
Example #36
class FigshareHarvester(JSONHarvester):
    short_name = 'figshare'
    long_name = 'figshare'
    url = 'http://figshare.com/'

    URL = 'http://api.figshare.com/v1/articles/search?search_for=*&from_date='

    schema = {
        'title': '/title',
        'description': '/description',
        'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
        'providerUpdatedDateTime': ('/modified_date', date_formatter),
        'uris': {
            'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
            'providerUris': ['/url']
        },
        'otherProperties': build_properties(
            ('serviceID', ('/article_id', str)),
            ('definedType', '/defined_type'),
            ('type', '/type'),
            ('links', '/links'),
            ('publishedDate', '/published_date')
        )
    }

    def harvest(self, start_date=None, end_date=None):
        """ Figshare should always have a 24 hour delay because they
        manually go through and check for test projects. Most of them
        are removed within 24 hours.

        So, we will shift everything back a day with harvesting to ensure
        nothing is harvested on the day of.
        """
        start_date = start_date - timedelta(1) if start_date else date.today() - timedelta(1 + settings.DAYS_BACK)
        end_date = end_date - timedelta(1) if end_date else date.today() - timedelta(1)

        search_url = '{0}{1}&to_date={2}'.format(
            self.URL,
            start_date.isoformat(),
            end_date.isoformat()
        )

        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['article_id']

            record_list.append(
                RawDocument(
                    {
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': six.text_type(doc_id),
                        'filetype': 'json'
                    }
                )
            )

        return record_list

    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['items_found']
        page = 1

        all_records = []
        while len(all_records) < total_records:
            record_list = records.json()['items']

            for record in record_list:
                if len(all_records) < total_records:
                    all_records.append(record)

            page += 1
            records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)

        return all_records
Example #37
 def formatted_properties(self):
     return {
         'otherProperties': build_properties(
             *list(map(self.format_property, self.property_list))
         )
     }
Example #38
 def formatted_properties(self):
     return {
         'otherProperties': build_properties(*map(self.format_property, self.property_list))
     }
Example #39
class DataOneHarvester(XMLHarvester):
    short_name = 'dataone'
    long_name = 'DataONE: Data Observation Network for Earth'
    url = 'https://www.dataone.org/'

    namespaces = {}

    record_encoding = None

    schema = {
        'otherProperties': build_properties(
            ('authorGivenName', "str[@name='authorGivenName']/node()"),
            ('authorSurName', "str[@name='authorSurName']/node()"),
            ('authoritativeMN', "str[@name='authoritativeMN']/node()"),
            ('checksum', "str[@name='checksum']/node()"),
            ('checksumAlgorithm', "str[@name='checksumAlgorithm']/node()"),
            ('datasource', "str[@name='datasource']/node()"),
            ('datePublished', "date[@name='datePublished']/node()"),
            ('dateUploaded', "date[@name='dateUploaded']/node()"),
            ('pubDate', "date[@name='pubDate']/node()"),
            ('updateDate', "date[@name='updateDate']/node()"),
            ('fileID', "str[@name='fileID']/node()"),
            ('formatId', "str[@name='formatId']/node()"),
            ('formatType', "str[@name='formatType']/node()"),
            ('identifier', "str[@name='identifier']/node()"),
            ('readPermission', "arr[@name='readPermission']/str/node()"),
            ('replicaMN', "arr[@name='replicaMN']/str/node()"),
            ('replicaVerifiedDate', "arr[@name='replicaVerifiedDate']/date/node()"),
            ('replicationAllowed', "bool[@name='replicationAllowed']/node()"),
            ('numberReplicas', "int[@name='numberReplicas']/node()"),
            ('preferredReplicationMN', "arr[@name='preferredReplicationMN']/str/node()"),
            ('rightsHolder', "str[@name='rightsHolder']/node()"),
            ('scientificName', "arr[@name='scientificName']/str/node()"),
            ('site', "arr[@name='site']/str/node()"),
            ('size', "long[@name='size']/node()"),
            ('isDocumentedBy', "arr[@name='isDocumentedBy']/str/node()"),
            ('serviceID', "str[@name='id']/node()"),
            ('sku', "str[@name='sku']/node()")
        ),
        'freeToRead': {
            'startDate': ("bool[@name='isPublic']/node()",
                          "date[@name='dateModified']/node()",
                          lambda x, y: parse(y[0]).date().isoformat() if x else None)
        },
        'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()",
                         "arr[@name='origin']/str/node()",
                         "arr[@name='investigator']/str/node()", process_contributors),
        'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()",
                 "arr[@name='resourceMap']/str/node()",
                 partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()",
                 lambda x: x if isinstance(x, list) else [x]),
        'providerUpdatedDateTime': ("str[@name='dateModified']/node()",
                                    compose(datetime_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
    }

    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info(
                    'Not normalizing record with ID {}, type {}'.format(
                        doc_id, format_type))
            else:
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml'
                    }))

        return xml_list

    def get_records(self, start_date, end_date):
        ''' helper function to get a response from the DataONE
        API, with the specified number of rows.
        Returns an etree element with results '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT,
                           params={
                               'q': query,
                               'start': 0,
                               'rows': 1
                           })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT,
                                params={
                                    'q': query,
                                    'start': n,
                                    'rows': 1000
                                })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000