def test_failing_transformation_with_raises(self):
    """Expect ``XPathEvalError`` when ``RAISE_IN_TRANSFORMER`` is enabled.

    The harvester's ``title`` schema entry is replaced with a deliberately
    malformed expression, then every harvested record is normalized inside
    ``pytest.raises(XPathEvalError)``.
    """
    base.settings.RAISE_IN_TRANSFORMER = True
    # The backslash is doubled so the runtime value is still ``1n\/@lid``
    # without depending on the invalid ``\/`` escape sequence.
    self.harvester.schema = updated_schema(
        TEST_SCHEMA, {'title': 'A completely 1n\\/@lid expre55ion'})

    with pytest.raises(XPathEvalError):
        for record in self.harvester.harvest():
            self.harvester.normalize(record)
def test_arg_kwargs(self):
    """Exercise ``pack`` with positional, keyword and mixed arguments.

    Four schema entries are built from the same title XPath and the
    normalized output of every harvested record is compared against the
    expected concatenations.
    """

    def process_title(title, title1="test"):
        # ``title1`` is either the default string or a list.
        if isinstance(title1, list):
            suffix = title1[0]
        else:
            suffix = title1
        return title[0] + suffix

    def process_title2(title1="test"):
        if isinstance(title1, list):
            return title1[0]
        return title1

    args = ("//dc:title/node()", )
    kwargs = {"title1": "//dc:title/node()"}

    title_rule = (pack(*args, **kwargs), process_title)
    extra_rules = build_properties(
        ('title2', (pack(*args), process_title)),
        ('title3', (pack(**kwargs), process_title2)),
        ('title4', (pack('//dc:title/node()', title1='//dc:title/node()'), process_title)),
    )
    self.harvester.schema = updated_schema(
        TEST_SCHEMA,
        {'title': title_rule, 'otherProperties': extra_rules},
    )

    normalized = [
        self.harvester.normalize(raw)
        for raw in self.harvester.harvest(days_back=1)
    ]

    for doc in normalized:
        other = doc['otherProperties']
        assert doc['title'] == "TestTest"
        assert other[0]['properties']['title2'] == 'Testtest'
        assert other[1]['properties']['title3'] == 'Test'
        assert other[2]['properties']['title4'] == "TestTest"
def schema(self):
    """Return ``self._schema`` with the ``uris`` entry overridden.

    The ``uris`` rule passes the header identifier and ``dc:identifier``
    XPath expressions to ``oai_process_uris_addis_ababa``.
    """
    uris_rule = (
        '//ns0:header/ns0:identifier/node()',
        '//dc:identifier/node()',
        oai_process_uris_addis_ababa,
    )
    return helpers.updated_schema(self._schema, {"uris": uris_rule})
def schema(self):
    """Return ``self._schema`` with ICPSR-specific ``uris`` rules.

    ``canonicalUri`` and ``objectUris`` are both derived from the
    ``dc:identifier`` nodes; ``objectUris`` is given as a one-element list.
    """
    identifier_xpath = '//dc:identifier/node()'
    canonical_rule = (
        identifier_xpath,
        helpers.compose(create_icpsr_url, helpers.single_result),
    )
    object_rules = [(identifier_xpath, icpsr_exttract_doi)]
    overrides = {
        "uris": {
            "canonicalUri": canonical_rule,
            "objectUris": object_rules,
        }
    }
    return helpers.updated_schema(self._schema, overrides)
def schema(self):
    """Return ``self._schema`` with the ``contributors`` entry overridden.

    The rule feeds the ``dc:creator`` and ``dc:contributor`` nodes to
    ``aoi_process_contributors_bhl``.
    """
    contributors_rule = (
        '//dc:creator/node()',
        '//dc:contributor/node()',
        aoi_process_contributors_bhl,
    )
    return updated_schema(self._schema, {'contributors': contributors_rule})
class DryadHarvester(OAIHarvester):
    """OAI harvester settings for the Dryad Data Repository.

    ``normalize`` returns ``None`` for records whose ``dc:status`` is
    ``deleted`` / ``item is not available`` and for records whose
    ``dc:type`` is not ``article``.
    """

    short_name = 'dryad'
    long_name = 'Dryad Data Repository'
    url = 'http://www.datadryad.org/oai/request'

    base_url = 'http://www.datadryad.org/oai/request'
    property_list = [
        'rights', 'format', 'relation', 'date',
        'identifier', 'type', 'setSpec',
    ]
    timezone_granularity = True

    # Base OAI schema with ``objectUris`` built from relation and
    # identifier nodes via ``format_dois_dryad``.
    schema = helpers.updated_schema(
        schemas.OAISCHEMA,
        {
            "uris": {
                "objectUris": (
                    '//dc:relation/node()',
                    '//dc:identifier/node()',
                    format_dois_dryad,
                )
            }
        },
    )

    def normalize(self, raw_doc):
        """Normalize ``raw_doc`` unless its status or type rules it out.

        Reads ``raw_doc['doc']`` (parsed as XML) and ``raw_doc['docID']``
        (used in log messages). Returns ``None`` for skipped records,
        otherwise whatever the parent ``normalize`` call returns.
        """
        parsed = etree.XML(raw_doc['doc'])

        # Fall back to '' when the XPath query matches nothing.
        status_nodes = parsed.xpath('//dc:status/node()', namespaces=self.namespaces)
        status = (status_nodes or [''])[0]
        if str(status).lower() in ('deleted', 'item is not available'):
            logger.info('Not normalizing record with ID {}, status {}'.format(raw_doc['docID'], status))
            return None

        type_nodes = parsed.xpath('//dc:type/node()', namespaces=self.namespaces)
        doc_type = (type_nodes or [''])[0]
        if doc_type.lower() != 'article':
            logger.info('Not normalizing record with ID {}, type {}'.format(raw_doc['docID'], doc_type))
            return None

        # NOTE(review): super(OAIHarvester, self) starts method lookup after
        # OAIHarvester, so OAIHarvester.normalize itself is bypassed here --
        # confirm this is intentional.
        return super(OAIHarvester, self).normalize(raw_doc)
def schema(self):
    """Return ``self._schema`` with ``description`` and ``uris`` overridden.

    ``description`` is processed by ``get_second_description``; both URI
    rules read the ``dc:identifier`` nodes and use ``oai_extract_dois``.
    """
    identifier_xpath = '//dc:identifier/node()'
    description_rule = ("//dc:description/node()", get_second_description)
    uri_rules = {
        "canonicalUri": (identifier_xpath, compose(single_result, oai_extract_dois)),
        "objectUris": (identifier_xpath, oai_extract_dois),
    }
    return updated_schema(
        self._schema,
        {"description": description_rule, "uris": uri_rules},
    )
def test_failing_transformation_wont_raise(self):
    """Expect ``ValidationError`` when ``RAISE_IN_TRANSFORMER`` is disabled.

    The harvester's ``title`` schema entry is replaced with a deliberately
    malformed expression, then every harvested record is normalized inside
    ``pytest.raises(ValidationError)``.
    """
    # 50 is the numeric value of logging.CRITICAL.
    base.transformer.logger.setLevel(50)
    base.settings.RAISE_IN_TRANSFORMER = False
    # The backslash is doubled so the runtime value is still ``1n\/@lid``
    # without depending on the invalid ``\/`` escape sequence.
    self.harvester.schema = updated_schema(
        TEST_SCHEMA, {'title': 'A completely 1n\\/@lid expre55ion'})

    with pytest.raises(ValidationError):
        for record in self.harvester.harvest():
            self.harvester.normalize(record)
def schema(self):
    """Return ``self._schema`` with the ``objectUris`` rule overridden.

    The rule passes the ``dc:relation`` and ``dc:identifier`` nodes to
    ``format_dois_dryad``.
    """
    object_uris_rule = (
        '//dc:relation/node()',
        '//dc:identifier/node()',
        format_dois_dryad,
    )
    overrides = {"uris": {"objectUris": object_uris_rule}}
    return helpers.updated_schema(self._schema, overrides)
def schema(self):
    """Return ``self._schema`` with the ``canonicalUri`` rule overridden.

    The rule reads the OAI header identifier and processes it with the
    composition of ``oai_extract_url_pubmedcentral`` and
    ``helpers.single_result``.
    """
    extract_url = helpers.compose(
        oai_extract_url_pubmedcentral, helpers.single_result)
    canonical_rule = ('//ns0:header/ns0:identifier/node()', extract_url)
    return helpers.updated_schema(
        self._schema,
        {"uris": {"canonicalUri": canonical_rule}},
    )
def schema(self):
    """Return ``OAISCHEMA`` extended with one rule per ``property_list`` item.

    Each item is looked up under both the ``dc`` and ``ns0`` prefixes and
    handed to ``self.resolve_property``; the rules are combined by
    ``build_properties`` under the ``otherProperties`` key.
    """

    def rule_for(name):
        # (property name, (dc xpath, ns0 xpath, processor))
        xpaths = ('//dc:{}/node()'.format(name), '//ns0:{}/node()'.format(name))
        return (name, xpaths + (self.resolve_property,))

    rules = [rule_for(item) for item in self.property_list]
    return updated_schema(
        OAISCHEMA, {'otherProperties': build_properties(*rules)})
def schema(self):
    """Build the schema: ``OAISCHEMA`` plus ``otherProperties`` rules.

    One ``(name, (dc xpath, ns0 xpath, self.resolve_property))`` pair is
    generated for every entry of ``self.property_list`` and the pairs are
    passed positionally to ``build_properties``.
    """
    pairs = []
    for item in self.property_list:
        dc_xpath = '//dc:{}/node()'.format(item)
        ns0_xpath = '//ns0:{}/node()'.format(item)
        pairs.append((item, (dc_xpath, ns0_xpath, self.resolve_property)))

    properties = {'otherProperties': build_properties(*pairs)}
    return updated_schema(OAISCHEMA, properties)
def schema(self):
    """Return ``self._schema`` with ``description`` and ``uris`` replaced.

    The description rule uses ``get_second_description``; the canonical
    and object URI rules both apply ``oai_extract_dois`` to the
    ``dc:identifier`` nodes (the canonical one through ``single_result``).
    """
    overrides = {}
    overrides["description"] = (
        "//dc:description/node()", get_second_description)
    overrides["uris"] = {
        "canonicalUri": ('//dc:identifier/node()',
                         compose(single_result, oai_extract_dois)),
        "objectUris": ('//dc:identifier/node()', oai_extract_dois),
    }
    return updated_schema(self._schema, overrides)
def test_failing_transformation_with_raises(self):
    """Normalizing with a malformed title rule must raise ``XPathEvalError``.

    ``base.settings.RAISE_IN_TRANSFORMER`` is switched on before the
    harvester schema is replaced and the harvested records are normalized.
    """
    base.settings.RAISE_IN_TRANSFORMER = True
    # Escaped backslash: same runtime text as the former ``\/`` spelling,
    # which is an invalid escape sequence in a normal string literal.
    malformed_title = 'A completely 1n\\/@lid expre55ion'
    self.harvester.schema = updated_schema(
        TEST_SCHEMA, {'title': malformed_title})

    with pytest.raises(XPathEvalError):
        for record in self.harvester.harvest():
            self.harvester.normalize(record)
def test_failing_transformation_wont_raise(self):
    """Normalizing with a malformed title rule must raise ``ValidationError``.

    ``base.settings.RAISE_IN_TRANSFORMER`` is switched off and the
    transformer logger is raised to level 50 before the harvested records
    are normalized.
    """
    # 50 is the numeric value of logging.CRITICAL.
    base.transformer.logger.setLevel(50)
    base.settings.RAISE_IN_TRANSFORMER = False
    # Escaped backslash: same runtime text as the former ``\/`` spelling,
    # which is an invalid escape sequence in a normal string literal.
    malformed_title = 'A completely 1n\\/@lid expre55ion'
    self.harvester.schema = updated_schema(
        TEST_SCHEMA, {'title': malformed_title})

    with pytest.raises(ValidationError):
        for record in self.harvester.harvest():
            self.harvester.normalize(record)
def schema(self):
    """Return ``self._schema`` with the ICPSR ``uris`` overrides applied.

    ``canonicalUri`` composes ``create_icpsr_url`` with
    ``helpers.single_result``; ``objectUris`` is a single-rule list using
    ``icpsr_exttract_doi``. Both read the ``dc:identifier`` nodes.
    """
    uris = {
        "canonicalUri": (
            '//dc:identifier/node()',
            helpers.compose(create_icpsr_url, helpers.single_result),
        ),
        "objectUris": [
            ('//dc:identifier/node()', icpsr_exttract_doi),
        ],
    }
    return helpers.updated_schema(self._schema, {"uris": uris})
class BHLHarvester(OAIHarvester):
    """OAI harvester settings for the Biodiversity Heritage Library."""

    short_name = 'bhl'
    long_name = 'Biodiversity Heritage Library OAI Repository'
    url = 'http://www.biodiversitylibrary.org/'

    base_url = 'http://www.biodiversitylibrary.org/oai'

    # Base OAI schema with ``contributors`` built from creator and
    # contributor nodes via ``aoi_process_contributors_bhl``.
    schema = updated_schema(
        OAISCHEMA,
        {
            'contributors': (
                '//dc:creator/node()',
                '//dc:contributor/node()',
                aoi_process_contributors_bhl,
            )
        },
    )

    property_list = ['type', 'date', 'relation', 'setSpec', 'rights']
class ScholarsbankHarvester(OAIHarvester):
    """OAI harvester settings for Scholars Bank, University of Oregon."""

    short_name = 'scholarsbank'
    long_name = 'Scholars Bank University of Oregon'
    url = 'http://scholarsbank.uoregon.edu'

    timezone_granularity = True
    base_url = 'http://scholarsbank.uoregon.edu/oai/request'

    property_list = [
        'type', 'source', 'format', 'relation',
        'date', 'description', 'setSpec', 'identifier',
    ]

    # Base OAI schema with ``description`` processed by ``second_result``.
    schema = updated_schema(
        OAISCHEMA,
        {
            'description': ('//dc:description/node()', second_result),
        },
    )
class PubMedCentralHarvester(OAIHarvester):
    """OAI harvester settings for PubMed Central."""

    short_name = 'pubmedcentral'
    long_name = 'PubMed Central'
    url = 'http://www.ncbi.nlm.nih.gov/pmc/'

    # Base OAI schema with ``canonicalUri`` derived from the OAI header
    # identifier via ``oai_extract_url_pubmedcentral``.
    schema = helpers.updated_schema(
        schemas.OAISCHEMA,
        {
            "uris": {
                "canonicalUri": (
                    '//ns0:header/ns0:identifier/node()',
                    helpers.compose(
                        oai_extract_url_pubmedcentral,
                        helpers.single_result,
                    ),
                )
            }
        },
    )

    base_url = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'
    property_list = [
        'type', 'source', 'rights', 'format',
        'setSpec', 'date', 'identifier',
    ]
def test_constants(self):
    """Check that ``CONSTANT`` schema values appear in normalized output.

    ``tags`` is a constant list passed through an identity lambda and
    ``otherProperties`` holds one entry made entirely of constants.
    """
    constant_property = {
        'name': CONSTANT('test'),
        'properties': {
            'test': CONSTANT('test')
        },
        'uri': CONSTANT('http://example.com'),
        'description': CONSTANT('A test field')
    }
    overrides = {
        'tags': (CONSTANT(['X']), lambda x: x),
        'otherProperties': [constant_property],
    }
    self.harvester.schema = updated_schema(TEST_SCHEMA, overrides)

    normalized = [
        self.harvester.normalize(raw)
        for raw in self.harvester.harvest(days_back=1)
    ]

    for doc in normalized:
        assert doc['otherProperties'][0]['properties']['test'] == 'test'
        assert doc['tags'] == ['X']
def test_constants(self):
    """Verify constant-valued schema entries survive normalization.

    The schema under test sets ``tags`` and a single ``otherProperties``
    entry from ``CONSTANT`` values; each normalized record must expose
    them unchanged.
    """
    tags_rule = (CONSTANT(['X']), lambda x: x)
    other_properties = [{
        'name': CONSTANT('test'),
        'properties': {'test': CONSTANT('test')},
        'uri': CONSTANT('http://example.com'),
        'description': CONSTANT('A test field'),
    }]
    self.harvester.schema = updated_schema(
        TEST_SCHEMA,
        {'tags': tags_rule, 'otherProperties': other_properties},
    )

    results = [
        self.harvester.normalize(item)
        for item in self.harvester.harvest(days_back=1)
    ]

    for result in results:
        first_property = result['otherProperties'][0]
        assert first_property['properties']['test'] == 'test'
        assert result['tags'] == ['X']
def test_arg_kwargs(self):
    """Check ``pack`` rules built from args, kwargs, and both together.

    The title XPath is supplied positionally, by keyword (``title1``) and
    in combination; the assertions pin the string each processor produces
    for every harvested record.
    """

    def process_title(title, title1="test"):
        tail = title1[0] if isinstance(title1, list) else title1
        return title[0] + tail

    def process_title2(title1="test"):
        if not isinstance(title1, list):
            return title1
        return title1[0]

    args = ("//dc:title/node()", )
    kwargs = {"title1": "//dc:title/node()"}

    overrides = {
        'title': (pack(*args, **kwargs), process_title),
        'otherProperties': build_properties(
            ('title2', (pack(*args), process_title)),
            ('title3', (pack(**kwargs), process_title2)),
            ('title4', (pack('//dc:title/node()',
                             title1='//dc:title/node()'), process_title)),
        ),
    }
    self.harvester.schema = updated_schema(TEST_SCHEMA, overrides)

    results = [
        self.harvester.normalize(item)
        for item in self.harvester.harvest(days_back=1)
    ]

    for result in results:
        assert result['title'] == "TestTest"
        second, third, fourth = (
            result['otherProperties'][index]['properties']
            for index in (0, 1, 2)
        )
        assert second['title2'] == 'Testtest'
        assert third['title3'] == 'Test'
        assert fourth['title4'] == "TestTest"
def _schema(self):
    """Return ``OAISCHEMA`` updated with ``self.formatted_properties``."""
    overrides = self.formatted_properties
    return updated_schema(OAISCHEMA, overrides)
}], 'description': 'This study seeks to understand how humans impact\ the dietary patterns of eight free-ranging vervet monkey\ (Chlorocebus pygerythrus) groups in South Africa using stable\ isotope analysis.', 'providerUpdatedDateTime': '2015-02-23T00:00:00', 'shareProperties': { 'source': 'test' } } TEST_SCHEMA = updated_schema(DOESCHEMA, { "title": ("//dc:title/node()", lambda x: "Title overwritten"), "otherProperties": build_properties( ("title1", ("//dc:title/node()", single_result)), ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())), ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower())) ) }) TEST_NAMESPACES = { 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'dc': 'http://purl.org/dc/elements/1.1/', 'dcq': 'http://purl.org/dc/terms/' } TEST_XML_DOC = b''' <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/"> <records count="97" morepages="true" start="1" end="10">
def schema(self):
    """Return ``self._schema`` with a PubMed Central ``canonicalUri`` rule.

    The OAI header identifier is processed by
    ``oai_extract_url_pubmedcentral`` composed with
    ``helpers.single_result``.
    """
    uris = {
        "canonicalUri": (
            '//ns0:header/ns0:identifier/node()',
            helpers.compose(
                oai_extract_url_pubmedcentral, helpers.single_result),
        )
    }
    return helpers.updated_schema(self._schema, {"uris": uris})
def schema(self):
    """Return ``self._schema`` with a BHL-specific ``contributors`` rule.

    Creator and contributor nodes are both passed to
    ``aoi_process_contributors_bhl``.
    """
    overrides = {
        'contributors': (
            '//dc:creator/node()',
            '//dc:contributor/node()',
            aoi_process_contributors_bhl,
        )
    }
    return updated_schema(self._schema, overrides)
def schema(self):
    """Return ``self._schema`` with the ``uris`` entry overridden.

    The rule hands the ``dc:identifier`` and ``dc:relation`` nodes to
    ``helpers.oai_process_uris``.
    """
    uris_rule = (
        '//dc:identifier/node()',
        '//dc:relation/node()',
        helpers.oai_process_uris,
    )
    return helpers.updated_schema(self._schema, {"uris": uris_rule})
the dietary patterns of eight free-ranging vervet monkey\ (Chlorocebus pygerythrus) groups in South Africa using stable\ isotope analysis.', 'providerUpdatedDateTime': '2015-02-23T00:00:00', 'shareProperties': { 'source': 'test' } } TEST_SCHEMA = updated_schema( DOESCHEMA, { "title": ("//dc:title/node()", lambda x: "Title overwritten"), "otherProperties": build_properties( ("title1", ("//dc:title/node()", single_result)), ("title2", ("//dc:title/node()", lambda x: single_result(x).lower())), ("title3", ("//dc:title/node()", "//dc:title/node()", lambda x, y: single_result(x) + single_result(y).lower()))) }) TEST_NAMESPACES = { 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'dc': 'http://purl.org/dc/elements/1.1/', 'dcq': 'http://purl.org/dc/terms/' } TEST_XML_DOC = ''' <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/"> <records count="97" morepages="true" start="1" end="10">
def schema(self):
    """Return ``self._schema`` with ``uris`` built from ``dc:identifier``.

    The identifier nodes are processed by ``helpers.oai_process_uris``.
    """
    uris_rule = ('//dc:identifier/node()', helpers.oai_process_uris)
    return helpers.updated_schema(self._schema, {"uris": uris_rule})
def schema(self):
    """Return ``self._schema`` with the ``languages`` entry overridden.

    The ``dc:language`` nodes are processed by
    ``umontreal_language_processor``.
    """
    languages_rule = ('//dc:language/node()', umontreal_language_processor)
    return updated_schema(self._schema, {'languages': languages_rule})
# }, 'description': 'This study seeks to understand how humans impact\ the dietary patterns of eight free-ranging vervet monkey\ (Chlorocebus pygerythrus) groups in South Africa using stable\ isotope analysis.', 'providerUpdatedDateTime': '2015-02-23T00:00:00', 'shareProperties': { 'source': 'crossref' } } TEST_SCHEMA = updated_schema(BASEXMLSCHEMA, { "title": ("//dc:title/node()", lambda x: "Title overwritten"), # "otherProperties": { # "title1": "//dc:title/node()", # "title2": ["//dc:title/node()", lambda x: x.lower()], # "title3": ["//dc:title/node()", "//dc:title/node()", lambda x, y: x + y.lower()] # } }) def get_leaves(d, leaves=None): if leaves is None: leaves = [] for k, v in d.items(): if isinstance(v, dict): leaves.extend(get_leaves(v, leaves)) else: leaves.append((k, v))
def schema(self):
    """Return ``self._schema`` with the ``uris`` entry overridden.

    The header identifier and ``dc:identifier`` nodes are passed to
    ``oai_process_pcurio``.
    """
    uris_rule = (
        '//ns0:header/ns0:identifier/node()',
        '//dc:identifier/node()',
        oai_process_pcurio,
    )
    return helpers.updated_schema(self._schema, {"uris": uris_rule})
def schema(self):
    """Return ``self._schema`` with the ``uris`` entry overridden.

    The header identifier and ``dc:identifier`` nodes are passed to
    ``format_uris_pubmedcentral``.
    """
    uris_rule = (
        '//ns0:header/ns0:identifier/node()',
        '//dc:identifier/node()',
        format_uris_pubmedcentral,
    )
    return helpers.updated_schema(self._schema, {"uris": uris_rule})
def schema(self):
    """Return ``self._schema`` with the ``description`` entry overridden.

    The ``dc:description`` nodes are processed by ``second_result``.
    """
    description_rule = ('//dc:description/node()', second_result)
    return helpers.updated_schema(
        self._schema, {'description': description_rule})
def schema(self):
    """Return ``self._schema`` with ``uris`` from identifier and relation.

    Both node sets are handed to ``helpers.oai_process_uris``.
    """
    overrides = {
        "uris": (
            "//dc:identifier/node()",
            "//dc:relation/node()",
            helpers.oai_process_uris,
        )
    }
    return helpers.updated_schema(self._schema, overrides)
def schema(self):
    """Return ``self._schema`` with a PubMed Central ``uris`` rule.

    ``format_uris_pubmedcentral`` receives the OAI header identifier nodes
    and the ``dc:identifier`` nodes.
    """
    overrides = {
        "uris": (
            '//ns0:header/ns0:identifier/node()',
            '//dc:identifier/node()',
            format_uris_pubmedcentral,
        )
    }
    return helpers.updated_schema(self._schema, overrides)
def schema(self):
    """Return ``self._schema`` with a Dryad ``objectUris`` rule.

    ``format_dois_dryad`` receives the ``dc:relation`` nodes and the
    ``dc:identifier`` nodes.
    """
    uris = {
        "objectUris": (
            '//dc:relation/node()',
            '//dc:identifier/node()',
            format_dois_dryad,
        )
    }
    return helpers.updated_schema(self._schema, {"uris": uris})