Пример #1
0
    def get_all_records(self):
        """Retrieve all available OAI records.

        Records are fetched page by page with the ``ListRecords`` verb.
        The ``resumptionToken`` found in each response is used to request
        the next page; fetching stops as soon as a response carries no
        (or an empty) ``resumptionToken``.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the serialized
            ``record`` element.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token

            # The metadataPrefix is sent with every request, also with
            # follow-up requests that carry a resumptionToken
            req_params['metadataPrefix'] = self.metadata_prefix

            resp = self.oai_call(req_params)
            tree = self.parse_oai_response(resp)

            records = tree.xpath('.//oai:ListRecords/oai:record',
                                 namespaces=self.namespaces)
            for record in records:
                yield 'application/xml', etree.tostring(record)

            # According to the OAI spec, we reached the last page of the
            # list if the 'resumptionToken' element is empty. find()
            # returns None when the element is missing altogether, so
            # that case is treated as the last page too instead of
            # failing on '.text'.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            resumption_token = (token_element.text
                                if token_element is not None else None)

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #2
0
    def adlib_search_call(self, params={}):
        """Query the Adlib endpoint and return the parsed XML response.

        The query string is made up of the configured database, search
        query, XML type and page size, starting at offset 0. Entries in
        ``params`` are applied on top of these and win on conflicts.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        query = dict(
            database=self.adlib_database,
            search=self.adlib_query,
            xmltype=self.adlib_xmltype,
            limit=self.per_page_limit,
            startfrom=0,
        )
        query.update(params)

        message = 'Getting %s (params: %s)' % (self.adlib_base_url, query)
        log.debug(message)

        response = self.http_session.get(self.adlib_base_url, params=query)
        # Error responses surface as an exception instead of being parsed
        response.raise_for_status()

        return etree.fromstring(response.content)
Пример #3
0
    def get_all_records(self):
        """Retrieve all available OAI records.

        This method is specifically overridden for OpenBeelden, as they
        encode the metadataPrefix in their resumption token, rather than
        having a separate HTTP GET parameter. The ``metadataPrefix``
        argument is therefore only sent with the first request; all
        follow-up requests carry just the ``resumptionToken``.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the serialized
            ``record`` element.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token
            else:
                # Only the initial request (no token yet) names the
                # metadataPrefix explicitly
                req_params['metadataPrefix'] = self.metadata_prefix

            resp = self.oai_call(req_params)
            tree = self.parse_oai_response(resp)

            records = tree.xpath('.//oai:ListRecords/oai:record',
                                 namespaces=self.namespaces)
            for record in records:
                yield 'application/xml', etree.tostring(record)

            # According to the OAI spec, we reached the last page of the
            # list if the 'resumptionToken' element is empty. find()
            # returns None when the element is missing altogether, so
            # that case is treated as the last page too instead of
            # failing on '.text'.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            resumption_token = (token_element.text
                                if token_element is not None else None)

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #4
0
    def get_all_records(self):
        """Retrieve all available OAI records.

        This method is specifically overridden for OpenBeelden, as they
        encode the metadataPrefix in their resumption token, rather than
        having a separate HTTP GET parameter. The ``metadataPrefix``
        argument is therefore only sent with the first request; all
        follow-up requests carry just the ``resumptionToken``.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the serialized
            ``record`` element.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token
            else:
                # Only the initial request (no token yet) names the
                # metadataPrefix explicitly
                req_params['metadataPrefix'] = self.metadata_prefix

            resp = self.oai_call(req_params)
            tree = self.parse_oai_response(resp)

            records = tree.xpath('.//oai:ListRecords/oai:record',
                                 namespaces=self.namespaces)
            for record in records:
                yield 'application/xml', etree.tostring(record)

            # According to the OAI spec, we reached the last page of the
            # list if the 'resumptionToken' element is empty. find()
            # returns None when the element is missing altogether, so
            # that case is treated as the last page too instead of
            # failing on '.text'.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            resumption_token = (token_element.text
                                if token_element is not None else None)

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #5
0
    def get_all_records(self):
        """Retrieve all available OAI records.

        Records are fetched page by page with the ``ListRecords`` verb.
        The ``resumptionToken`` found in each response is used to request
        the next page; fetching stops as soon as a response carries no
        (or an empty) ``resumptionToken``.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the serialized
            ``record`` element.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token

            # The metadataPrefix is sent with every request, also with
            # follow-up requests that carry a resumptionToken
            req_params['metadataPrefix'] = self.metadata_prefix

            resp = self.oai_call(req_params)
            tree = self.parse_oai_response(resp)

            records = tree.xpath('.//oai:ListRecords/oai:record',
                                 namespaces=self.namespaces)
            for record in records:
                yield 'application/xml', etree.tostring(record)

            # According to the OAI spec, we reached the last page of the
            # list if the 'resumptionToken' element is empty. find()
            # returns None when the element is missing altogether, so
            # that case is treated as the last page too instead of
            # failing on '.text'.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            resumption_token = (token_element.text
                                if token_element is not None else None)

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #6
0
    def opensearch_call(self, params=None):
        """Makes a call to the Opensearch endpoint and returns an XML tree.

        When the server answers with a 5xx status, the same request is
        retried up to ``max_retries`` times, sleeping one second longer
        before each successive retry. If the last response is still an
        error, ``raise_for_status()`` raises.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        if params is None:
            params = {}

        log.debug('Getting %s (params: %s)', self.url, params)

        r = self.http_session.get(self.url, params=params)

        # In case a server error is returned (for example, a gateway
        # time-out), we retry the same request for a number of times
        max_retries = 10
        retried = 0
        # Strictly less-than: '<=' would issue an 11th retry and log
        # "retry 11 of 10"
        while r.status_code >= 500 and retried < max_retries:
            log.warning('Received server error (status %s), retry %s of %s',
                        r.status_code, retried + 1, max_retries)

            # Linear back-off: 1s before the first retry, 2s before the
            # second, and so on
            sleep_s = retried + 1
            log.warning('Sleeping %s second(s) before retrying...', sleep_s)
            sleep(sleep_s)

            r = self.http_session.get(self.url, params=params)

            retried += 1

        r.raise_for_status()

        return etree.fromstring(r.content)
Пример #7
0
    def call(self, url, headers, data):
        """POST ``data`` to ``url`` and return the JSON-decoded response.

        When the server answers with a 5xx status, the same request is
        retried up to ``max_retries`` times, sleeping one second longer
        before each successive retry. If the last response is still an
        error, ``raise_for_status()`` raises.

        :param url: the URL to post to; ``self.url`` is used when this
            is empty
        :param headers: HTTP headers sent along with the request
        :param data: the request body
        :returns: the response body decoded from JSON
        """
        # The first attempt, the retries and the log line all use one and
        # the same target, so that a retry really repeats the same request
        target_url = url or self.url

        log.debug('Getting %s (headers: %s, data: %s)',
                  target_url, headers, data)

        r = requests.post(target_url, data=data, headers=headers)

        # In case a server error is returned (for example, a gateway
        # time-out), we retry the same request for a number of times
        max_retries = 10
        retried = 0
        # Strictly less-than: '<=' would issue an 11th retry and log
        # "retry 11 of 10"
        while r.status_code >= 500 and retried < max_retries:
            log.warning('Received server error (status %s), retry %s of %s',
                        r.status_code, retried + 1, max_retries)

            # Linear back-off: 1s before the first retry, 2s before the
            # second, and so on
            sleep_s = retried + 1
            log.warning('Sleeping %s second(s) before retrying...', sleep_s)
            sleep(sleep_s)

            r = requests.post(target_url, data=data, headers=headers)

            retried += 1

        r.raise_for_status()

        return r.json()
Пример #8
0
    def opensearch_call(self, params=None):
        """Makes a call to the Opensearch endpoint and returns an XML tree.

        When the server answers with a 5xx status, the same request is
        retried up to ``max_retries`` times, sleeping one second longer
        before each successive retry. If the last response is still an
        error, ``raise_for_status()`` raises.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        if params is None:
            params = {}

        log.debug('Getting %s (params: %s)', self.url, params)

        r = self.http_session.get(self.url, params=params)

        # In case a server error is returned (for example, a gateway
        # time-out), we retry the same request for a number of times
        max_retries = 10
        retried = 0
        # Strictly less-than: '<=' would issue an 11th retry and log
        # "retry 11 of 10"
        while r.status_code >= 500 and retried < max_retries:
            log.warning('Received server error (status %s), retry %s of %s',
                        r.status_code, retried + 1, max_retries)

            # Linear back-off: 1s before the first retry, 2s before the
            # second, and so on
            sleep_s = retried + 1
            log.warning('Sleeping %s second(s) before retrying...', sleep_s)
            sleep(sleep_s)

            r = self.http_session.get(self.url, params=params)

            retried += 1

        r.raise_for_status()

        return etree.fromstring(r.content)
Пример #9
0
    def api_call(self, url, params=None):
        """Call the API below ``self.api_base_url`` and return the
        JSON-decoded response.

        :type url: str
        :param url: the part of the URL appended to ``self.api_base_url``
        :type params: dict
        :param params: a dictionary sent as arguments in the query
            string; ``key`` (the configured API key) and ``format=json``
            are always added and override same-named entries
        :returns: the response body decoded from JSON
        """
        # Work on a copy: updating the argument in place would modify the
        # caller's dict and, when no argument is given, would store the
        # API key in a default value shared between all calls
        request_params = dict(params or {})
        request_params.update(
            key=self.source_definition['rijksmuseum_api_key'],
            format='json'
        )
        url = '%s%s' % (self.api_base_url, url)

        log.debug('Getting %s (params: %s)', url, request_params)
        r = self.http_session.get(url, params=request_params)
        r.raise_for_status()

        return r.json()
Пример #10
0
    def api_call(self, url, params=None):
        """Call the API below ``self.api_base_url`` and return the
        JSON-decoded response.

        :type url: str
        :param url: the part of the URL appended to ``self.api_base_url``
        :type params: dict
        :param params: a dictionary sent as arguments in the query
            string; ``key`` (the configured API key) and ``format=json``
            are always added and override same-named entries
        :returns: the response body decoded from JSON
        """
        # Work on a copy: updating the argument in place would modify the
        # caller's dict and, when no argument is given, would store the
        # API key in a default value shared between all calls
        request_params = dict(params or {})
        request_params.update(
            key=self.source_definition['rijksmuseum_api_key'],
            format='json'
        )
        url = '%s%s' % (self.api_base_url, url)

        log.debug('Getting %s (params: %s)', url, request_params)
        r = self.http_session.get(url, params=request_params)
        r.raise_for_status()

        return r.json()
Пример #11
0
    def oai_call(self, params={}):
        """Perform a GET request against the OAI endpoint and hand back
        the raw response body.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :returns: the content of the response
        """
        message = 'Getting %s (params: %s)' % (self.oai_base_url, params)
        log.debug(message)

        response = self.http_session.get(self.oai_base_url, params=params)
        # Error responses surface as an exception instead of a body
        response.raise_for_status()

        return response.content
Пример #12
0
    def oai_call(self, params={}):
        """Perform a GET request against the OAI endpoint and hand back
        the raw response body.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :returns: the content of the response
        """
        message = 'Getting %s (params: %s)' % (self.oai_base_url, params)
        log.debug(message)

        response = self.http_session.get(self.oai_base_url, params=params)
        # Error responses surface as an exception instead of a body
        response.raise_for_status()

        return response.content
Пример #13
0
    def api_call(self, cursor, params=None):
        """Request one page of Benaki Museum records and return the
        JSON-decoded response.

        The request URL is assembled by hand from ``self.api_base_url``,
        the configured API key, a fixed data-provider query and
        ``cursor``. ``params`` is only used for the debug log line; it
        is not sent with the request.

        :param cursor: the cursor value placed in the query string
        :type params: dict
        :param params: additional parameters (logged only)
        :returns: the response body decoded from JSON
        """
        api_key = self.source_definition['api_key']
        # Already URL-encoded form of: DATA_PROVIDER:"Benaki Museum"
        query = 'DATA_PROVIDER%3A%22Benaki+Museum%22'

        # Work on a copy: updating the argument in place would modify the
        # caller's dict and, when no argument is given, would store the
        # API key and cursor in a default value shared between all calls
        params = dict(params or {})
        params.update(wskey=api_key, query=query, cursor=cursor)

        # The URL is built manually because the query is pre-encoded;
        # passing it through the 'params' argument of the HTTP session
        # would encode it a second time.
        # NOTE(review): 'cursor' is inserted as-is, so it presumably has
        # to be URL-safe or encoded by the caller -- confirm.
        url = '%s?wskey=%s&query=%s&cursor=%s' % (
            self.api_base_url, api_key, query, cursor)

        log.debug('Getting %s (params: %s)' % (url, params))
        r = self.http_session.get(url)
        r.raise_for_status()

        return r.json()
Пример #14
0
    def opensearch_call(self, params={}):
        """Perform a GET request against the Opensearch endpoint and
        parse the response body as XML.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        message = 'Getting %s (params: %s)' % (self.url, params)
        log.debug(message)

        response = self.http_session.get(self.url, params=params)
        # Error responses surface as an exception instead of being parsed
        response.raise_for_status()

        xml_tree = etree.fromstring(response.content)
        return xml_tree
Пример #15
0
    def opensearch_call(self, params={}):
        """Perform a GET request against the Opensearch endpoint and
        parse the response body as XML.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        message = 'Getting %s (params: %s)' % (self.url, params)
        log.debug(message)

        response = self.http_session.get(self.url, params=params)
        # Error responses surface as an exception instead of being parsed
        response.raise_for_status()

        xml_tree = etree.fromstring(response.content)
        return xml_tree
Пример #16
0
    def commons_api_call(self, image_name):
        """Fetch the metadata of a single media file through the
        Wikimedia Commons API. The raw response body is returned.

        :type image_name: str
        :param image_name: the title of the Commons page containing the
                           image (e.g. ``File:Studioportretten.jpg``)
        """
        query = dict(image=image_name, forcehtml='')

        message = 'Getting %s (params: %s)' % (self.commons_api_url, query)
        log.debug(message)

        response = self.http_session.get(self.commons_api_url, params=query)
        # Error responses surface as an exception instead of a body
        response.raise_for_status()

        return response.content
Пример #17
0
    def get_all_records(self):
        """Fetch every OAI record that is not flagged as deleted.

        The ``ListRecords`` verb is requested page by page, following
        the ``resumptionToken`` of each response until none is left.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the content as a string.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token

            req_params['metadataPrefix'] = self.metadata_prefix

            tree = self.parse_oai_response(self.oai_call(req_params))

            for record in tree.xpath('.//oai:ListRecords/oai:record',
                                     namespaces=self.namespaces):
                # A header with status="deleted" marks a removed record
                deleted_header = record.find('oai:header[@status="deleted"]',
                                             namespaces=self.namespaces)
                if deleted_header is None:
                    yield 'application/xml', etree.tostring(record)
                else:
                    log.debug(
                        'Header specifies that the record is deleted, skipping.'
                    )

            # According to the OAI spec, the last page of the list is
            # reached when the 'resumptionToken' element is empty. Some
            # OAI implementations drop the 'resumptionToken' element
            # completely on the last page, in which case find() gives
            # back None.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            if token_element is None:
                resumption_token = None
            else:
                resumption_token = token_element.text

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #18
0
    def get_all_records(self):
        """Fetch every OAI record that is not flagged as deleted.

        The ``ListRecords`` verb is requested page by page, following
        the ``resumptionToken`` of each response until none is left.

        :returns: a generator that yields a tuple for each record,
            a tuple consists of the content-type and the content as a string.
        """
        resumption_token = None
        while True:
            req_params = {'verb': 'ListRecords'}
            if resumption_token:
                req_params['resumptionToken'] = resumption_token

            req_params['metadataPrefix'] = self.metadata_prefix

            tree = self.parse_oai_response(self.oai_call(req_params))

            for record in tree.xpath('.//oai:ListRecords/oai:record',
                                     namespaces=self.namespaces):
                # A header with status="deleted" marks a removed record
                deleted_header = record.find('oai:header[@status="deleted"]',
                                             namespaces=self.namespaces)
                if deleted_header is None:
                    yield 'application/xml', etree.tostring(record)
                else:
                    log.debug('Header specifies that the record is deleted, skipping.')

            # According to the OAI spec, the last page of the list is
            # reached when the 'resumptionToken' element is empty. Some
            # OAI implementations drop the 'resumptionToken' element
            # completely on the last page, in which case find() gives
            # back None.
            token_element = tree.find('.//oai:resumptionToken',
                                      namespaces=self.namespaces)
            if token_element is None:
                resumption_token = None
            else:
                resumption_token = token_element.text

            if not resumption_token:
                log.debug('resumptionToken empty, done fetching list')
                break
Пример #19
0
    def oai_call(self, params=None):
        """Makes a call to the OAI endpoint and returns the raw response
        body.

        The configured ``self.oai_set`` is added as the ``set`` argument,
        except when a ``resumptionToken`` is present: in that case no
        ``set`` argument is sent at all.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :returns: the content of the response
        """
        # Work on a copy: changing the argument in place would modify the
        # caller's dict and, when no argument is given, would leave the
        # 'set' value behind in a default shared between all calls
        request_params = dict(params or {})

        if 'resumptionToken' in request_params:
            # Never combine 'set' with a resumptionToken
            request_params.pop('set', None)
        elif self.oai_set:
            # Add the set variable to the parameters (if available)
            request_params['set'] = self.oai_set

        log.debug('Getting %s (params: %s)',
                  self.oai_base_url, request_params)
        r = self.http_session.get(self.oai_base_url, params=request_params)
        r.raise_for_status()

        return r.content
Пример #20
0
    def wikimedia_api_call(self, params=None):
        """Calls the MediaWiki API and returns the raw response body.

        By default the file members of ``self.wikimedia_category`` are
        queried, 250 per request, in XML format. Entries in ``params``
        are applied on top of these defaults.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :returns: the content of the response
        """
        req_params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtype': 'file',
            'cmtitle': self.wikimedia_category,
            'cmlimit': 250,
            'format': 'xml'
        }
        if params:
            req_params.update(params)

        # Log the parameters that are actually sent, not just the
        # caller's overrides
        log.debug('Getting %s (params: %s)', self.base_url, req_params)
        r = self.http_session.get(self.base_url, params=req_params)
        r.raise_for_status()

        return r.content
Пример #21
0
    def oai_call(self, params=None):
        """Makes a call to the OAI endpoint and returns the raw response
        body.

        The configured ``self.oai_set`` is added as the ``set`` argument,
        except when a ``resumptionToken`` is present: in that case both
        ``set`` and ``metadataPrefix`` are left out of the request.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :returns: the content of the response
        """
        # Work on a copy: changing the argument in place would modify the
        # caller's dict and, when no argument is given, would leave the
        # 'set' value behind in a default shared between all calls
        request_params = dict(params or {})

        if 'resumptionToken' in request_params:
            # Remove set and metadataPrefix, when a resumptionToken is
            # present
            request_params.pop('set', None)
            request_params.pop('metadataPrefix', None)
        elif self.oai_set:
            # Add the set variable to the parameters (if available)
            request_params['set'] = self.oai_set

        log.debug('Getting %s (params: %s)',
                  self.oai_base_url, request_params)
        r = self.http_session.get(self.oai_base_url, params=request_params)
        r.raise_for_status()

        return r.content
Пример #22
0
    def get_all_records(self):
        """Yield the Commons metadata of every file in the Wiki category.

        The category members are listed page by page through
        ``wikimedia_api_call``, following the ``cmcontinue`` value of
        each response. For every listed file the metadata is fetched
        with ``commons_api_call``; files whose metadata response
        contains an ``error`` element are skipped.

        :returns: a generator that yields a tuple for each file,
            a tuple consists of the content-type and the metadata as
            returned by ``commons_api_call``.
        """
        cmcontinue = None

        while True:
            req_params = {}
            if cmcontinue:
                req_params['cmcontinue'] = cmcontinue

            # Get the file pages in the specified Wiki category
            file_pages = etree.fromstring(self.wikimedia_api_call(req_params))

            # Request the metadata of each page
            for file_page in file_pages.findall('.//cm'):
                page_title = file_page.attrib['title']

                page_meta = self.commons_api_call(page_title)
                page_meta_tree = etree.fromstring(page_meta)

                # Skip this page if the response contains errors (the Commons
                # API doesn't return proper HTTP status codes). The result
                # of find() is compared against None explicitly: the truth
                # value of an lxml element reflects whether it has
                # children, so a childless <error> element would go
                # unnoticed in a plain truth test.
                page_meta_error = page_meta_tree.find('.//error')
                if page_meta_error is not None:
                    log.warning('Skipping "%s" because of Commons API error: %s',
                                page_title, page_meta_error.text)
                    continue

                yield 'application/xml', page_meta

            continue_values = file_pages.xpath(
                './/query-continue/categorymembers/@cmcontinue')
            cmcontinue = continue_values[0] if continue_values else None

            # When cmcontinue is empty or None, we've reached the last page
            if not cmcontinue:
                log.debug('cmcontinue empty, done fetching category pages')
                break
Пример #23
0
    def call(self, url, headers, data):
        """POST ``data`` to ``url`` and return the JSON-decoded response.

        When the server answers with a 5xx status, the same request is
        retried up to ``max_retries`` times, sleeping one second longer
        before each successive retry. If the last response is still an
        error, ``raise_for_status()`` raises.

        :param url: the URL to post to; ``self.url`` is used when this
            is empty
        :param headers: HTTP headers sent along with the request
        :param data: the request body
        :returns: the response body decoded from JSON
        """
        # The first attempt, the retries and the log line all use one and
        # the same target, so that a retry really repeats the same request
        target_url = url or self.url

        log.debug('Getting %s (headers: %s, data: %s)',
                  target_url, headers, data)

        r = requests.post(target_url, data=data, headers=headers)

        # In case a server error is returned (for example, a gateway
        # time-out), we retry the same request for a number of times
        max_retries = 10
        retried = 0
        # Strictly less-than: '<=' would issue an 11th retry and log
        # "retry 11 of 10"
        while r.status_code >= 500 and retried < max_retries:
            log.warning('Received server error (status %s), retry %s of %s',
                        r.status_code, retried + 1, max_retries)

            # Linear back-off: 1s before the first retry, 2s before the
            # second, and so on
            sleep_s = retried + 1
            log.warning('Sleeping %s second(s) before retrying...', sleep_s)
            sleep(sleep_s)

            r = requests.post(target_url, data=data, headers=headers)

            retried += 1

        r.raise_for_status()

        return r.json()
Пример #24
0
    def adlib_search_call(self, params={}):
        """Query the Adlib endpoint and return the parsed XML response.

        The query string is made up of the configured database, search
        query, XML type and page size, starting at offset 0. Entries in
        ``params`` are applied on top of these and win on conflicts.

        :type params: dict
        :param params: a dictionary sent as arguments in the query string
        :rtype: lxml.etree
        """
        query = dict(
            database=self.adlib_database,
            search=self.adlib_query,
            xmltype=self.adlib_xmltype,
            limit=self.per_page_limit,
            startfrom=0,
        )
        query.update(params)

        message = 'Getting %s (params: %s)' % (self.adlib_base_url, query)
        log.debug(message)

        response = self.http_session.get(self.adlib_base_url, params=query)
        # Error responses surface as an exception instead of being parsed
        response.raise_for_status()

        return etree.fromstring(response.content)
Пример #25
0
    def test_download_results(self):
        """Iterate over all results of the ArtsHolland extractor and log
        the first two items of each result.

        The test makes no assertions; it only fails when constructing
        the extractor or iterating over its results raises.
        """
        # NOTE(review): this presumably talks to the live endpoint below,
        # so it needs network access -- confirm.
        extractor = ArtsHollandExtractor(
            {'url': 'http://api.artsholland.com/sparql'})
        for result in extractor.get_all_results():
            log.debug("result %s %s", result[0], result[1])
Пример #26
0
 def test_download_results(self):
     """Walk through every result of the ArtsHolland extractor, logging
     the first two items of each one. No assertions are made.
     """
     source_definition = {'url': 'http://api.artsholland.com/sparql'}
     extractor = ArtsHollandExtractor(source_definition)
     for result in extractor.get_all_results():
         first, second = result[0], result[1]
         log.debug("result %s %s", first, second)