Example No. 1
    def run(self):
        if self.isil != 'DE-15':
            raise RuntimeError('not implemented except for DE-15')

        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        cache.sess.mount('http://', adapter)

        finc = self.config.get('ai', 'finc-solr')
        ai = self.config.get('ai', 'ai-solr')

        with self.input().open() as handle:
            with self.output().open('w') as output:
                for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                    # ISSNs marked NOT_FOUND are looked up in the finc SOLR core, all others in the ai SOLR core.
                    if row.status == 'NOT_FOUND':
                        link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (finc, self.isil, row.issn)
                        self.logger.info('fetch #%05d: %s' % (i, link))
                        body = cache.get(link)
                        content = json.loads(body)
                        output.write_tsv('finc', row.issn, content['response']['numFound'], link)
                    else:
                        link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (ai, self.isil, row.issn)
                        self.logger.info('fetch #%05d: %s' % (i, link))
                        body = cache.get(link)
                        content = json.loads(body)
                        output.write_tsv('ai', row.issn, content['response']['numFound'], link)
Example No. 2
    def run(self):
        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries)
        cache.sess.mount('http://', adapter)

        filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format(self=self)
        rows, offset = self.rows, 0

        with self.output().open('w') as output:
            while True:
                params = {"rows": rows, "offset": offset, "filter": filter}
                # Python 2 urllib.urlencode; the later variants of this task (Examples No. 6 and 7) use urllib.parse.urlencode.
                url = 'http://api.crossref.org/works?%s' % (urllib.urlencode(params))
                # Parse retry: on invalid JSON, drop the cached entry and fetch again; give up after the second attempt.
                for attempt in range(1, 3):
                    body = cache.get(url)
                    try:
                        content = json.loads(body)
                    except ValueError as err:
                        if attempt == 2:
                            self.logger.debug("URL was %s" % url)
                            self.logger.debug(err)
                            self.logger.debug(body[:100])
                            raise
                        if os.path.exists(cache.get_cache_file(url)):
                            self.logger.debug("trying to recover by removing cached entry")
                            os.remove(cache.get_cache_file(url))
                    else:
                        break
                items = content["message"]["items"]
                self.logger.debug("%s: %s" % (url, len(items)))
                if len(items) == 0:
                    break
                output.write(body + "\n")
                offset += rows
Example No. 3
    def run(self):
        if self.isil != 'DE-15':
            raise RuntimeError('not implemented except for DE-15')

        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        cache.sess.mount('http://', adapter)

        with self.input().open() as handle:
            with self.output().open('w') as output:
                for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                    if row.status == 'NOT_FOUND':
                        link = 'https://katalog.ub.uni-leipzig.de/Search/Results?lookfor=%s&type=ISN' % row.issn
                        self.logger.info('fetch #%05d: %s' % (i, link))
                        body = cache.get(link)
                        if 'Keine Ergebnisse!' in body:
                            output.write_tsv(row.issn, 'ERR_NOT_IN_CATALOG', link)
                        else:
                            soup = BeautifulSoup(body)
                            rs = soup.findAll("div", {"class" : "floatleft"})
                            if len(rs) == 0:
                                output.write_tsv(row.issn, 'ERR_LAYOUT', link)
                                continue
                            first = rs[0]
                            match = re.search(r'Treffer([0-9]+)-([0-9]+)von([0-9]+)', first.text)
                            if match:
                                total = match.group(3)
                                output.write_tsv(row.issn, 'FOUND_RESULTS_%s' % total, link)
                            else:
                                output.write_tsv(row.issn, 'ERR_NO_MATCH', link)
Example No. 4
    def run(self):
        if self.isil != 'DE-15':
            raise RuntimeError('not implemented except for DE-15')

        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        cache.sess.mount('http://', adapter)

        with self.input().open() as handle:
            with self.output().open('w') as output:
                for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                    if row.status == 'NOT_FOUND':
                        link = 'https://katalog.ub.uni-leipzig.de/Search/Results?lookfor=%s&type=ISN' % row.issn
                        self.logger.info('fetch #%05d: %s' % (i, link))
                        body = cache.get(link)
                        if 'Keine Ergebnisse!' in body:
                            output.write_tsv(row.issn, 'ERR_NOT_IN_CATALOG', link)
                        else:
                            soup = BeautifulSoup(body)
                            rs = soup.findAll("div", {"class": "floatleft"})
                            if len(rs) == 0:
                                output.write_tsv(row.issn, 'ERR_LAYOUT', link)
                                continue
                            first = rs[0]
                            match = re.search(r'Treffer([0-9]+)-([0-9]+)von([0-9]+)', first.text)
                            if match:
                                total = match.group(3)
                                output.write_tsv(row.issn, 'FOUND_RESULTS_%s' % total, link)
                            else:
                                output.write_tsv(row.issn, 'ERR_NO_MATCH', link)
Example No. 5
    def run(self):
        if self.isil != 'DE-15':
            raise RuntimeError('not implemented except for DE-15')

        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        cache.sess.mount('http://', adapter)

        finc = self.config.get('ai', 'finc-solr')
        ai = self.config.get('ai', 'ai-solr')

        def numFound(link):
            """ Given a SOLR query URL, return the number of docs found. """
            body = cache.get(link)
            content = json.loads(body)
            return content['response']['numFound']

        with self.input().open() as handle:
            with self.output().open('w') as output:
                for i, row in enumerate(handle.iter_tsv(cols=('issn', 'status'))):
                    if row.status == 'NOT_FOUND':
                        link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (finc, self.isil, row.issn)
                        self.logger.info('fetch #%05d: %s', i, link)
                        output.write_tsv('finc', row.issn, numFound(link), link)
                    else:
                        link = '%s/select?q=institution:%s+AND+issn:%s&wt=json' % (ai, self.isil, row.issn)
                        self.logger.info('fetch #%05d: %s', i, link)
                        output.write_tsv('ai', row.issn, numFound(link), link)
Example No. 6
    def run(self):
        """
        > Using large offset values can result in extremely long response times. Offsets
        in the 100,000s and beyond will likely cause a timeout before the API is able
        to respond. An alternative to paging through very large result sets (like a
        corpus used for text and data mining) is to use the API's exposure of Solr's
        deep paging cursors. Any combination of query, filters and facets may be used
        with deep paging cursors. While rows may be specified along with cursor,
        offset and sample cannot be used. To use deep paging make a query as normal,
        but include the cursor parameter with a value of * (https://git.io/vFyAn).

        > But we prefer carrots to sticks. As of September 18th 2017 any API
        queries that use HTTPS and have appropriate contact information will be
        directed to a special pool of API machines that are reserved for polite
        users. (https://git.io/vFyN5), refs #9059.
        """
        cache = URLCache(
            directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries)
        cache.sess.mount('http://', adapter)

        filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format(
            self=self)
        rows, offset = self.rows, 0
        cursor = '*'

        with self.output().open('w') as output:
            params = {'rows': rows, 'filter': filter}

            while True:
                params['cursor'] = cursor

                # Do not fail, if user has not configured mailto, https://git.io/vFyN5.
                if self.config.get('crossref', 'mailto', fallback=None):
                    params['mailto'] = self.config.get('crossref', 'mailto')

                url = 'https://api.crossref.org/works?%s' % (
                    urllib.parse.urlencode(params))

                for attempt in range(1, self.attempts):
                    # Be polite: sleep only before requests that actually hit the network.
                    if not cache.is_cached(url):
                        time.sleep(self.sleep)
                    body = cache.get(url)
                    try:
                        content = json.loads(body)
                    except ValueError as err:
                        if attempt == self.attempts - 1:
                            self.logger.debug('URL was %s', url)
                            self.logger.debug(err)
                            self.logger.debug(body[:100] + '...')
                            raise

                        cache_file = cache.get_cache_file(url)
                        if os.path.exists(cache_file):
                            self.logger.debug(
                                'trying to recover by removing cached entry at %s',
                                cache_file)
                            os.remove(cache_file)
                    else:
                        break

                count = len(content['message']['items'])
                self.logger.debug("%s: %s", url, count)
                if count == 0:
                    break

                body = body + '\n'
                if isinstance(body, string_types):
                    body = body.encode('utf-8')
                output.write(body)
                offset += rows

                if 'next-cursor' not in content['message']:
                    raise RuntimeError('missing key: next-cursor')
                cursor = content['message']['next-cursor']
Example No. 7
    def run(self):
        """
        > Using large offset values can result in extremely long response times. Offsets
        in the 100,000s and beyond will likely cause a timeout before the API is able
        to respond. An alternative to paging through very large result sets (like a
        corpus used for text and data mining) is to use the API's exposure of Solr's
        deep paging cursors. Any combination of query, filters and facets may be used
        with deep paging cursors. While rows may be specified along with cursor,
        offset and sample cannot be used. To use deep paging make a query as normal,
        but include the cursor parameter with a value of * (https://git.io/vFyAn).

        > But we prefer carrots to sticks. As of September 18th 2017 any API
        queries that use HTTPS and have appropriate contact information will be
        directed to a special pool of API machines that are reserved for polite
        users. (https://git.io/vFyN5), refs #9059.
        """
        cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
        adapter = requests.adapters.HTTPAdapter(max_retries=self.max_retries)
        cache.sess.mount('http://', adapter)

        filter = "from-{self.filter}-date:{self.begin},until-{self.filter}-date:{self.end}".format(self=self)
        rows, offset = self.rows, 0
        cursor = '*'

        with self.output().open('w') as output:
            params = {'rows': rows, 'filter': filter}

            while True:
                params['cursor'] = cursor

                # Do not fail, if user has not configured mailto, https://git.io/vFyN5.
                if self.config.get('crossref', 'mailto', fallback=None):
                    params['mailto'] = self.config.get('crossref', 'mailto')

                url = 'https://api.crossref.org/works?%s' % (urllib.parse.urlencode(params))

                for attempt in range(1, self.attempts):
                    # Be polite: sleep only before requests that actually hit the network.
                    if not cache.is_cached(url):
                        time.sleep(self.sleep)
                    body = cache.get(url)
                    try:
                        content = json.loads(body)
                    except ValueError as err:
                        if attempt == self.attempts - 1:
                            self.logger.debug('URL was %s', url)
                            self.logger.debug(err)
                            self.logger.debug(body[:100] + '...')
                            raise

                        cache_file = cache.get_cache_file(url)
                        if os.path.exists(cache_file):
                            self.logger.debug('trying to recover by removing cached entry at %s', cache_file)
                            os.remove(cache_file)
                    else:
                        break

                count = len(content['message']['items'])
                self.logger.debug("%s: %s", url, count)
                if count == 0:
                    break

                output.write(body + '\n')
                offset += rows

                if 'next-cursor' not in content['message']:
                    raise RuntimeError('missing key: next-cursor')
                cursor = content['message']['next-cursor']
Example No. 8
def test_get_cache_file(tmpdir):
    cache = URLCache(directory=str(tmpdir))
    fn = cache.get_cache_file("http://x.com")
    assert fn.startswith(str(tmpdir))
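
For reference, a minimal usage sketch of the URLCache pattern shared by the examples above: create the cache in a temporary directory, mount a retrying HTTPAdapter on its session, and fetch through it. This is a sketch, not the library's documented API; the import path is an assumption (none of the snippets show it), and only calls that actually appear above are used (the directory keyword, sess, is_cached, get_cache_file, get).

    # Sketch only. The import path is assumed; adjust it to the project that provides URLCache.
    import json
    import os
    import tempfile

    import requests

    from urlcache import URLCache  # assumption, not shown in the snippets above

    # Cache responses in the system temp directory, as in the examples above.
    cache = URLCache(directory=os.path.join(tempfile.gettempdir(), '.urlcache'))
    cache.sess.mount('http://', requests.adapters.HTTPAdapter(max_retries=3))

    url = 'https://api.crossref.org/works?rows=1'
    if cache.is_cached(url):
        print('already cached at %s' % cache.get_cache_file(url))

    body = cache.get(url)  # fetches once, then serves the cached body
    print(len(json.loads(body)['message']['items']))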