Python updated_schema示例，scrapi.base.helpers.updated_schema Python示例

示例#1

0

显示文件

文件： test_transformer.py 项目： Johnetordoff/scrapi

    def test_failing_transformation_with_raises(self):
        """With RAISE_IN_TRANSFORMER on, a bad XPath title must raise XPathEvalError.

        The title entry of the schema is replaced by a string that is not a
        valid XPath expression, then every harvested record is normalized
        inside ``pytest.raises``.
        """
        base.settings.RAISE_IN_TRANSFORMER = True

        # Raw string: '\/' is not a recognised escape sequence, so the non-raw
        # form only worked by accident and warns on modern Python. The value
        # (backslash followed by slash) is unchanged.
        self.harvester.schema = updated_schema(TEST_SCHEMA, {'title': r'A completely 1n\/@lid expre55ion'})

        with pytest.raises(XPathEvalError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)

示例#2

0

显示文件

文件： test_transformer.py 项目： Johnetordoff/scrapi

    def test_arg_kwargs(self):
        """Exercise ``pack`` with positional-only, keyword-only and mixed XPath arguments.

        Each schema entry pairs a ``pack(...)`` of XPath expressions with a
        processor function; the assertions pin the values the processors
        produced for every normalized record.
        """
        title_xpath = "//dc:title/node()"

        def process_title(title, title1="test"):
            # title1 may arrive as a list (hence the isinstance check) or as
            # the plain default string.
            head = title[0]
            if isinstance(title1, list):
                return head + title1[0]
            return head + title1

        def process_title2(title1="test"):
            if isinstance(title1, list):
                return title1[0]
            return title1

        positional = (title_xpath, )
        keyword = {"title1": title_xpath}

        extra_properties = build_properties(
            ('title2', (pack(*positional), process_title)),
            ('title3', (pack(**keyword), process_title2)),
            ('title4', (pack(title_xpath, title1=title_xpath), process_title)),
        )
        self.harvester.schema = updated_schema(TEST_SCHEMA, {
            'title': (pack(*positional, **keyword), process_title),
            'otherProperties': extra_properties,
        })

        results = []
        for record in self.harvester.harvest(days_back=1):
            results.append(self.harvester.normalize(record))

        for result in results:
            assert result['title'] == "TestTest"
            other = result['otherProperties']
            assert other[0]['properties']['title2'] == 'Testtest'
            assert other[1]['properties']['title3'] == 'Test'
            assert other[2]['properties']['title4'] == "TestTest"

示例#3

0

显示文件

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     The override is a tuple of two XPath expressions (header identifier and
     ``dc:identifier`` nodes) followed by ``oai_process_uris_addis_ababa``.
     """
     uri_rule = (
         '//ns0:header/ns0:identifier/node()',
         '//dc:identifier/node()',
         oai_process_uris_addis_ababa,
     )
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#4

0

显示文件

文件： icpsr.py 项目： AndrewSallans/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with ICPSR ``uris`` rules.

     Both URI entries read the ``dc:identifier`` nodes; the canonical URI goes
     through ``create_icpsr_url`` composed with ``helpers.single_result``, the
     object URIs through ``icpsr_exttract_doi``.
     """
     identifier_xpath = '//dc:identifier/node()'
     to_canonical = helpers.compose(create_icpsr_url, helpers.single_result)
     uris = {
         "canonicalUri": (identifier_xpath, to_canonical),
         "objectUris": [(identifier_xpath, icpsr_exttract_doi)],
     }
     return helpers.updated_schema(self._schema, {"uris": uris})

示例#5

0

显示文件

 def schema(self):
     """Return ``self._schema`` run through ``updated_schema`` with a ``contributors`` override.

     The override is a tuple of two XPath expressions (``dc:creator`` and
     ``dc:contributor`` nodes) followed by ``aoi_process_contributors_bhl``.
     """
     contributor_rule = (
         '//dc:creator/node()',
         '//dc:contributor/node()',
         aoi_process_contributors_bhl,
     )
     return updated_schema(self._schema, {'contributors': contributor_rule})

示例#6

0

显示文件

class DryadHarvester(OAIHarvester):
    """OAI harvester configuration for the Dryad Data Repository.

    ``normalize`` returns ``None`` for records whose ``dc:status`` is
    ``deleted`` / ``item is not available`` and for records whose ``dc:type``
    is not ``article``; everything else is delegated to a parent ``normalize``.
    """

    short_name = 'dryad'
    long_name = 'Dryad Data Repository'
    url = 'http://www.datadryad.org/oai/request'

    base_url = 'http://www.datadryad.org/oai/request'
    property_list = [
        'rights', 'format', 'relation', 'date',
        'identifier', 'type', 'setSpec',
    ]
    timezone_granularity = True

    # objectUris are derived from the dc:relation and dc:identifier nodes by
    # format_dois_dryad.
    schema = helpers.updated_schema(
        schemas.OAISCHEMA,
        {
            "uris": {
                "objectUris": (
                    '//dc:relation/node()',
                    '//dc:identifier/node()',
                    format_dois_dryad,
                ),
            },
        },
    )

    def normalize(self, raw_doc):
        """Normalize ``raw_doc`` unless its status or type excludes it.

        Args:
            raw_doc: Mapping-like raw document; ``raw_doc['doc']`` is parsed as
                XML and ``raw_doc['docID']`` is used in log messages.

        Returns:
            ``None`` for skipped records, otherwise whatever the parent
            ``normalize`` returns.
        """
        tree = etree.XML(raw_doc['doc'])

        status_nodes = tree.xpath('//dc:status/node()', namespaces=self.namespaces)
        status = status_nodes[0] if status_nodes else ''
        if str(status).lower() in ('deleted', 'item is not available'):
            logger.info('Not normalizing record with ID {}, status {}'.format(raw_doc['docID'], status))
            return None

        type_nodes = tree.xpath('//dc:type/node()', namespaces=self.namespaces)
        doc_type = type_nodes[0] if type_nodes else ''
        if doc_type.lower() != 'article':
            logger.info('Not normalizing record with ID {}, type {}'.format(raw_doc['docID'], doc_type))
            return None

        # NOTE(review): super(OAIHarvester, self) starts the method lookup
        # *after* OAIHarvester, so OAIHarvester.normalize (if it defines one)
        # is skipped -- confirm this is intended rather than
        # super(DryadHarvester, self).
        return super(OAIHarvester, self).normalize(raw_doc)

示例#7

0

显示文件

文件： datacite.py 项目： NeuroVault/scrapi

 def schema(self):
     """Return ``self._schema`` run through ``updated_schema`` with description and URI overrides.

     The description uses ``get_second_description``; both URI entries read the
     ``dc:identifier`` nodes and go through ``oai_extract_dois`` (the canonical
     URI additionally composed with ``single_result``).
     """
     identifier_xpath = '//dc:identifier/node()'
     overrides = {
         "description": ("//dc:description/node()", get_second_description),
         "uris": {
             "canonicalUri": (identifier_xpath, compose(single_result, oai_extract_dois)),
             "objectUris": (identifier_xpath, oai_extract_dois),
         },
     }
     return updated_schema(self._schema, overrides)

示例#8

0

显示文件

文件： test_transformer.py 项目： Johnetordoff/scrapi

    def test_failing_transformation_wont_raise(self):
        """With RAISE_IN_TRANSFORMER off, a bad XPath title surfaces as ValidationError.

        The transformer logger level is raised to 50 before the run, and the
        title entry of the schema is replaced by a string that is not a valid
        XPath expression.
        """
        base.transformer.logger.setLevel(50)
        base.settings.RAISE_IN_TRANSFORMER = False

        # Raw string: '\/' is not a recognised escape sequence, so the non-raw
        # form only worked by accident and warns on modern Python. The value
        # (backslash followed by slash) is unchanged.
        self.harvester.schema = updated_schema(TEST_SCHEMA, {'title': r'A completely 1n\/@lid expre55ion'})

        with pytest.raises(ValidationError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)

示例#9

0

显示文件

文件： dryad.py 项目： felliott/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with an ``objectUris`` rule.

     The rule reads the ``dc:relation`` and ``dc:identifier`` nodes and hands
     them to ``format_dois_dryad``.
     """
     object_uri_rule = (
         '//dc:relation/node()',
         '//dc:identifier/node()',
         format_dois_dryad,
     )
     overrides = {"uris": {"objectUris": object_uri_rule}}
     return helpers.updated_schema(self._schema, overrides)

示例#10

0

显示文件

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``canonicalUri`` rule.

     The rule reads the OAI header identifier nodes and applies
     ``oai_extract_url_pubmedcentral`` composed with ``helpers.single_result``.
     """
     extract_url = helpers.compose(oai_extract_url_pubmedcentral,
                                   helpers.single_result)
     canonical_rule = ('//ns0:header/ns0:identifier/node()', extract_url)
     return helpers.updated_schema(
         self._schema, {"uris": {"canonicalUri": canonical_rule}})

示例#11

0

显示文件

文件： __init__.py 项目： jeffreyliu3230/scrapi

    def schema(self):
        """Return ``OAISCHEMA`` updated with one ``otherProperties`` rule per listed property.

        For every name in ``self.property_list`` a rule is built from the
        ``dc:`` and ``ns0:`` XPath for that name plus ``self.resolve_property``.
        """
        entries = []
        for name in self.property_list:
            dc_xpath = '//dc:{}/node()'.format(name)
            ns0_xpath = '//ns0:{}/node()'.format(name)
            entries.append((name, (dc_xpath, ns0_xpath, self.resolve_property)))

        other_properties = build_properties(*entries)
        return updated_schema(OAISCHEMA, {'otherProperties': other_properties})

示例#12

0

显示文件

文件： __init__.py 项目： bdyetton/scrapi

    def schema(self):
        """Return ``OAISCHEMA`` updated with ``otherProperties`` built from ``self.property_list``.

        Each property name yields a ``(name, (dc xpath, ns0 xpath, resolver))``
        entry that is handed to ``build_properties``.
        """
        xpath_templates = ('//dc:{}/node()', '//ns0:{}/node()')

        def rule_for(name):
            # (dc xpath, ns0 xpath, resolver) for a single property name.
            dc_template, ns0_template = xpath_templates
            return (dc_template.format(name),
                    ns0_template.format(name),
                    self.resolve_property)

        entries = [(name, rule_for(name)) for name in self.property_list]
        properties = {'otherProperties': build_properties(*entries)}

        return updated_schema(OAISCHEMA, properties)

示例#13

0

显示文件

 def schema(self):
     """Return ``self._schema`` run through ``updated_schema`` with description and URI overrides.

     ``description`` is processed by ``get_second_description``; both URI
     entries read ``dc:identifier`` nodes and use ``oai_extract_dois``.
     """
     description_rule = ("//dc:description/node()", get_second_description)
     canonical_rule = ('//dc:identifier/node()',
                       compose(single_result, oai_extract_dois))
     object_rule = ('//dc:identifier/node()', oai_extract_dois)

     return updated_schema(self._schema, {
         "description": description_rule,
         "uris": {
             "canonicalUri": canonical_rule,
             "objectUris": object_rule,
         },
     })

示例#14

0

显示文件

    def test_failing_transformation_with_raises(self):
        """With RAISE_IN_TRANSFORMER on, a bad XPath title must raise XPathEvalError.

        The schema's title entry is swapped for a string that is not a valid
        XPath expression before every harvested record is normalized.
        """
        base.settings.RAISE_IN_TRANSFORMER = True

        # Raw string: '\/' is an invalid escape sequence in a normal literal
        # and triggers a warning on modern Python; the runtime value is the
        # same backslash-slash pair.
        self.harvester.schema = updated_schema(
            TEST_SCHEMA, {'title': r'A completely 1n\/@lid expre55ion'})

        with pytest.raises(XPathEvalError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)

示例#15

0

显示文件

    def test_failing_transformation_wont_raise(self):
        """With RAISE_IN_TRANSFORMER off, a bad XPath title surfaces as ValidationError.

        The transformer logger level is set to 50 first; the schema's title
        entry is then swapped for a string that is not a valid XPath expression.
        """
        base.transformer.logger.setLevel(50)
        base.settings.RAISE_IN_TRANSFORMER = False

        # Raw string: '\/' is an invalid escape sequence in a normal literal
        # and triggers a warning on modern Python; the runtime value is the
        # same backslash-slash pair.
        self.harvester.schema = updated_schema(
            TEST_SCHEMA, {'title': r'A completely 1n\/@lid expre55ion'})

        with pytest.raises(ValidationError):
            for record in self.harvester.harvest():
                self.harvester.normalize(record)

示例#16

0

显示文件

文件： icpsr.py 项目： zamattiac/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with ICPSR ``uris`` rules.

     ``canonicalUri`` applies ``create_icpsr_url`` composed with
     ``helpers.single_result``; ``objectUris`` is a one-element list whose rule
     applies ``icpsr_exttract_doi``. Both read the ``dc:identifier`` nodes.
     """
     canonical_rule = (
         '//dc:identifier/node()',
         helpers.compose(create_icpsr_url, helpers.single_result),
     )
     object_rules = [('//dc:identifier/node()', icpsr_exttract_doi)]
     overrides = {
         "uris": {
             "canonicalUri": canonical_rule,
             "objectUris": object_rules,
         },
     }
     return helpers.updated_schema(self._schema, overrides)

示例#17

0

显示文件

文件： bhl.py 项目： jeffreyliu3230/scrapi

class BHLHarvester(OAIHarvester):
    """OAI harvester configuration for the Biodiversity Heritage Library."""

    short_name = 'bhl'
    long_name = 'Biodiversity Heritage Library OAI Repository'
    url = 'http://www.biodiversitylibrary.org/'
    base_url = 'http://www.biodiversitylibrary.org/oai'

    property_list = ['type', 'date', 'relation', 'setSpec', 'rights']

    # Contributors are read from both dc:creator and dc:contributor nodes and
    # handed to aoi_process_contributors_bhl.
    schema = updated_schema(
        OAISCHEMA,
        {
            'contributors': (
                '//dc:creator/node()',
                '//dc:contributor/node()',
                aoi_process_contributors_bhl,
            ),
        },
    )

示例#18

0

显示文件

class ScholarsbankHarvester(OAIHarvester):
    """OAI harvester configuration for Scholars Bank (University of Oregon)."""

    short_name = 'scholarsbank'
    long_name = 'Scholars Bank University of Oregon'
    url = 'http://scholarsbank.uoregon.edu'
    base_url = 'http://scholarsbank.uoregon.edu/oai/request'
    timezone_granularity = True

    property_list = [
        'type',
        'source',
        'format',
        'relation',
        'date',
        'description',
        'setSpec',
        'identifier',
    ]

    # The description is taken from the dc:description nodes via second_result.
    schema = updated_schema(
        OAISCHEMA,
        {'description': ('//dc:description/node()', second_result)},
    )

示例#19

0

显示文件

文件： pubmedcentral.py 项目： jeffreyliu3230/scrapi

class PubMedCentralHarvester(OAIHarvester):
    """OAI harvester configuration for PubMed Central."""

    short_name = 'pubmedcentral'
    long_name = 'PubMed Central'
    url = 'http://www.ncbi.nlm.nih.gov/pmc/'

    # The canonical URI is derived from the OAI header identifier nodes by
    # oai_extract_url_pubmedcentral composed with helpers.single_result.
    schema = helpers.updated_schema(
        schemas.OAISCHEMA,
        {
            "uris": {
                "canonicalUri": (
                    '//ns0:header/ns0:identifier/node()',
                    helpers.compose(oai_extract_url_pubmedcentral,
                                    helpers.single_result),
                ),
            },
        },
    )

    base_url = 'http://www.pubmedcentral.nih.gov/oai/oai.cgi'
    property_list = [
        'type',
        'source',
        'rights',
        'format',
        'setSpec',
        'date',
        'identifier',
    ]

示例#20

0

显示文件

文件： test_transformer.py 项目： Johnetordoff/scrapi

    def test_constants(self):
        """Check that ``CONSTANT`` schema values come through normalization unchanged.

        ``tags`` is a ``CONSTANT`` list paired with an identity lambda; the
        single ``otherProperties`` entry is built entirely from ``CONSTANT``
        values.
        """
        constant_property = {
            'name': CONSTANT('test'),
            'properties': {
                'test': CONSTANT('test'),
            },
            'uri': CONSTANT('http://example.com'),
            'description': CONSTANT('A test field'),
        }
        overrides = {
            'tags': (CONSTANT(['X']), lambda x: x),
            'otherProperties': [constant_property],
        }
        self.harvester.schema = updated_schema(TEST_SCHEMA, overrides)

        results = []
        for record in self.harvester.harvest(days_back=1):
            results.append(self.harvester.normalize(record))

        for result in results:
            first_property = result['otherProperties'][0]
            assert first_property['properties']['test'] == 'test'
            assert result['tags'] == ['X']

示例#21

0

显示文件

    def test_constants(self):
        """Verify that schema entries wrapped in ``CONSTANT`` reach the normalized output as-is.

        The schema overrides ``tags`` (a ``CONSTANT`` list with an identity
        lambda) and ``otherProperties`` (one dict of ``CONSTANT`` values).
        """
        tags_rule = (CONSTANT(['X']), lambda x: x)
        other_properties = [
            {
                'name': CONSTANT('test'),
                'properties': {'test': CONSTANT('test')},
                'uri': CONSTANT('http://example.com'),
                'description': CONSTANT('A test field'),
            },
        ]
        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {'tags': tags_rule, 'otherProperties': other_properties},
        )

        harvested = self.harvester.harvest(days_back=1)
        results = [self.harvester.normalize(record) for record in harvested]

        for result in results:
            properties = result['otherProperties'][0]['properties']
            assert properties['test'] == 'test'
            assert result['tags'] == ['X']

示例#22

0

显示文件

    def test_arg_kwargs(self):
        """Exercise ``pack`` with positional, keyword and mixed XPath arguments.

        Every schema entry pairs a ``pack(...)`` with a processor function;
        the expected values below pin what each processor produced.
        """
        def process_title(title, title1="test"):
            # title1 is either a list (first element used) or a plain string.
            suffix = title1[0] if isinstance(title1, list) else title1
            return title[0] + suffix

        def process_title2(title1="test"):
            if not isinstance(title1, list):
                return title1
            return title1[0]

        args = ("//dc:title/node()", )
        kwargs = {"title1": "//dc:title/node()"}

        title_rule = (pack(*args, **kwargs), process_title)
        title2_rule = (pack(*args), process_title)
        title3_rule = (pack(**kwargs), process_title2)
        title4_rule = (
            pack('//dc:title/node()', title1='//dc:title/node()'),
            process_title,
        )

        self.harvester.schema = updated_schema(
            TEST_SCHEMA,
            {
                'title': title_rule,
                'otherProperties': build_properties(
                    ('title2', title2_rule),
                    ('title3', title3_rule),
                    ('title4', title4_rule),
                ),
            },
        )

        results = [
            self.harvester.normalize(record)
            for record in self.harvester.harvest(days_back=1)
        ]

        # (key, expected value) per otherProperties index, in assertion order.
        expected_properties = [
            ('title2', 'Testtest'),
            ('title3', 'Test'),
            ('title4', "TestTest"),
        ]
        for result in results:
            assert result['title'] == "TestTest"
            for index, (key, expected) in enumerate(expected_properties):
                assert result['otherProperties'][index]['properties'][key] == expected

示例#23

0

显示文件

文件： __init__.py 项目： NeuroVault/scrapi

 def _schema(self):
     """Return ``OAISCHEMA`` combined with ``self.formatted_properties`` via ``updated_schema``."""
     overrides = self.formatted_properties
     return updated_schema(OAISCHEMA, overrides)

示例#24

0

显示文件

文件： utils.py 项目： erinspace/scrapi

    }],
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}


# Schema used by the tests: DOESCHEMA with the title rule replaced and three
# title-derived entries supplied as otherProperties.
TEST_SCHEMA = updated_schema(
    DOESCHEMA,
    {
        "title": ("//dc:title/node()", lambda x: "Title overwritten"),
        "otherProperties": build_properties(
            ("title1", ("//dc:title/node()", single_result)),
            (
                "title2",
                ("//dc:title/node()", lambda x: single_result(x).lower()),
            ),
            (
                "title3",
                (
                    "//dc:title/node()",
                    "//dc:title/node()",
                    lambda x, y: single_result(x) + single_result(y).lower(),
                ),
            ),
        ),
    },
)


# Namespace prefix -> URI mapping for the rdf / dc / dcq prefixes.
TEST_NAMESPACES = dict(
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    dc='http://purl.org/dc/elements/1.1/',
    dcq='http://purl.org/dc/terms/',
)


TEST_XML_DOC = b'''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">

示例#25

0

显示文件

文件： pubmedcentral.py 项目： erinspace/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``canonicalUri`` rule.

     The rule reads the OAI header identifier nodes and applies
     ``oai_extract_url_pubmedcentral`` composed with ``helpers.single_result``.
     """
     canonical_rule = (
         '//ns0:header/ns0:identifier/node()',
         helpers.compose(oai_extract_url_pubmedcentral, helpers.single_result),
     )
     overrides = {"uris": {"canonicalUri": canonical_rule}}
     return helpers.updated_schema(self._schema, overrides)

示例#26

0

显示文件

文件： bhl.py 项目： NeuroVault/scrapi

 def schema(self):
     """Return ``self._schema`` run through ``updated_schema`` with a ``contributors`` override.

     Contributors are read from the ``dc:creator`` and ``dc:contributor`` nodes
     and processed by ``aoi_process_contributors_bhl``.
     """
     overrides = {
         'contributors': (
             '//dc:creator/node()',
             '//dc:contributor/node()',
             aoi_process_contributors_bhl,
         ),
     }
     return updated_schema(self._schema, overrides)

示例#27

0

显示文件

 def _schema(self):
     """Build the schema by applying ``self.formatted_properties`` to ``OAISCHEMA`` with ``updated_schema``."""
     formatted = self.formatted_properties
     return updated_schema(OAISCHEMA, formatted)

示例#28

0

显示文件

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the ``dc:identifier`` and ``dc:relation`` nodes and
     processed by ``helpers.oai_process_uris``.
     """
     uri_rule = (
         '//dc:identifier/node()',
         '//dc:relation/node()',
         helpers.oai_process_uris,
     )
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#29

0

显示文件

文件： utils.py 项目： jeffreyliu3230/scrapi

            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime':
    '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'test'
    }
}

# Schema used by the tests: DOESCHEMA with the title rule replaced and three
# title-derived entries supplied as otherProperties.
TEST_SCHEMA = updated_schema(DOESCHEMA, {
    "title": ("//dc:title/node()", lambda x: "Title overwritten"),
    "otherProperties": build_properties(
        ("title1", ("//dc:title/node()", single_result)),
        ("title2", ("//dc:title/node()",
                    lambda x: single_result(x).lower())),
        ("title3", ("//dc:title/node()",
                    "//dc:title/node()",
                    lambda x, y: single_result(x) + single_result(y).lower())),
    ),
})

# Namespace prefix -> URI mapping for the rdf / dc / dcq prefixes.
TEST_NAMESPACES = dict(
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    dc='http://purl.org/dc/elements/1.1/',
    dcq='http://purl.org/dc/terms/',
)

TEST_XML_DOC = '''
    <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcq="http://purl.org/dc/terms/">
        <records count="97" morepages="true" start="1" end="10">

示例#30

0

显示文件

文件： smithsonian.py 项目： NeuroVault/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the ``dc:identifier`` nodes and processed by
     ``helpers.oai_process_uris``.
     """
     uri_rule = ('//dc:identifier/node()', helpers.oai_process_uris)
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#31

0

显示文件

文件： umontreal.py 项目： zamattiac/scrapi

 def schema(self):
     """Return ``self._schema`` run through ``updated_schema`` with a ``languages`` override.

     Languages are read from the ``dc:language`` nodes and processed by
     ``umontreal_language_processor``.
     """
     language_rule = ('//dc:language/node()', umontreal_language_processor)
     return updated_schema(self._schema, {'languages': language_rule})

示例#32

0

显示文件

文件： utils.py 项目： Eleonore9/scrapi

    # },
    'description': 'This study seeks to understand how humans impact\
            the dietary patterns of eight free-ranging vervet monkey\
            (Chlorocebus pygerythrus) groups in South Africa using stable\
            isotope analysis.',
    'providerUpdatedDateTime': '2015-02-23T00:00:00',
    'shareProperties': {
        'source': 'crossref'
    }
}


# Schema used by the tests: BASEXMLSCHEMA with only the title rule replaced.
#
# The otherProperties entries below are disabled in this version of the file
# and are kept for reference only:
#     "otherProperties": {
#         "title1": "//dc:title/node()",
#         "title2": ["//dc:title/node()", lambda x: x.lower()],
#         "title3": ["//dc:title/node()", "//dc:title/node()", lambda x, y: x + y.lower()]
#     }
TEST_SCHEMA = updated_schema(
    BASEXMLSCHEMA,
    {"title": ("//dc:title/node()", lambda x: "Title overwritten")},
)


def get_leaves(d, leaves=None):
    """Collect the ``(key, value)`` pairs of every non-dict value in a nested dict.

    Args:
        d: Dict to walk; values that are dicts are descended into recursively.
        leaves: Optional list to append to; a new list is created when omitted.

    Returns:
        The list of ``(key, value)`` leaf pairs. This is the same object as
        ``leaves`` when one was passed in.
    """
    if leaves is None:
        leaves = []

    for k, v in d.items():
        if isinstance(v, dict):
            # The recursive call appends straight into the shared accumulator,
            # so its return value must not be extend()-ed back onto the same
            # list (that would duplicate every entry collected so far).
            get_leaves(v, leaves)
        else:
            leaves.append((k, v))

    return leaves

示例#33

0

显示文件

文件： pcurio.py 项目： AndrewSallans/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the OAI header identifier and ``dc:identifier`` nodes
     and processed by ``oai_process_pcurio``.
     """
     uri_rule = (
         '//ns0:header/ns0:identifier/node()',
         '//dc:identifier/node()',
         oai_process_pcurio,
     )
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#34

0

显示文件

文件： pubmedcentral.py 项目： zamattiac/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the OAI header identifier and ``dc:identifier`` nodes
     and processed by ``format_uris_pubmedcentral``.
     """
     overrides = {
         "uris": (
             '//ns0:header/ns0:identifier/node()',
             '//dc:identifier/node()',
             format_uris_pubmedcentral,
         ),
     }
     return helpers.updated_schema(self._schema, overrides)

示例#35

0

显示文件

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``description`` override.

     The description is read from the ``dc:description`` nodes and processed by
     ``second_result``.
     """
     description_rule = ('//dc:description/node()', second_result)
     overrides = {'description': description_rule}
     return helpers.updated_schema(self._schema, overrides)

示例#36

0

显示文件

文件： mblwhoilibrary.py 项目： kms6bn/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the ``dc:identifier`` and ``dc:relation`` nodes and
     processed by ``helpers.oai_process_uris``.
     """
     uri_rule = (
         "//dc:identifier/node()",
         "//dc:relation/node()",
         helpers.oai_process_uris,
     )
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#37

0

显示文件

文件： pubmedcentral.py 项目： AndrewSallans/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with a ``uris`` override.

     URIs are read from the OAI header identifier and ``dc:identifier`` nodes
     and processed by ``format_uris_pubmedcentral``.
     """
     header_xpath = '//ns0:header/ns0:identifier/node()'
     identifier_xpath = '//dc:identifier/node()'
     uri_rule = (header_xpath, identifier_xpath, format_uris_pubmedcentral)
     return helpers.updated_schema(self._schema, {"uris": uri_rule})

示例#38

0

显示文件

文件： dryad.py 项目： erinspace/scrapi

 def schema(self):
     """Return ``self._schema`` passed through ``helpers.updated_schema`` with an ``objectUris`` rule.

     Object URIs are read from the ``dc:relation`` and ``dc:identifier`` nodes
     and processed by ``format_dois_dryad``.
     """
     relation_xpath = '//dc:relation/node()'
     identifier_xpath = '//dc:identifier/node()'
     uris = {
         "objectUris": (relation_xpath, identifier_xpath, format_dois_dryad),
     }
     return helpers.updated_schema(self._schema, {"uris": uris})