Пример #1
0
def parse_bookxcess_html(document, headers, filename=None):
    """Parse a Bookxcess book listings page and queue its linked documents.

    Every ``<a>``/``<area>`` element carrying an ``href`` is inspected. Links
    whose (lower-cased) path ends in ``.htm``/``.html`` are queued with this
    function as the callback, links ending in ``.pdf`` with
    ``parse_bookxcess_pdf``; all other links are ignored. Each distinct URL is
    deferred once to ``download_page`` on the ``downloader`` queue, under a
    task name derived from ``Document.hash_url(url)``.

    Args:
        document: Object whose ``contents`` attribute holds the page markup
            handed to BeautifulSoup.
        headers: Not used by this function (presumably part of the callback
            signature expected by ``download_page`` -- TODO confirm).
        filename: Not used by this function.

    Returns:
        None.
    """
    soup = BeautifulSoup(document.contents)
    parsers = {
        '.htm': parse_bookxcess_html,
        '.html': parse_bookxcess_html,
        '.pdf': parse_bookxcess_pdf
    }

    # Keyed by URL so a document linked several times is queued only once.
    urls = {}
    for link in soup.findAll(['a', 'area'], href=True):
        url = link['href'].strip()
        # Only links that are not already absolute get the site prefix.
        # Checking just 'http://' would mangle 'https://' links into
        # BOOKXCESS + 'https://...'.
        if not url.startswith(('http://', 'https://')):
            url = BOOKXCESS + url
        path = urlsplit(url).path.lower()
        parser = parsers.get(splitext(path)[1])
        if parser is not None:
            urls[url] = (parser, {"filename": basename(path)})

    for url, (parser, args) in urls.items():
        task_name = 'download-%s' % Document.hash_url(url)
        logging.info('parse_bookxcess_html: downloading %s in task %s',
                     url, task_name)
        try:
            deferred.defer(download_page, url, callback=parser, args=args,
                           _name=task_name, _queue='downloader')
        except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
            # A task with this name is already queued or was used before;
            # skipping it is intentional so each URL is downloaded once.
            pass
Пример #2
0
    def get(self):
        """Queue downloads for one named URL set of a configured source.

        The response is always served as ``text/plain``. When the ``dbg``
        request parameter is set, the work is delegated to ``self.debug()``.
        Otherwise the ``source`` and ``name`` request parameters select a URL
        list and a callback name from ``cron.tasks``; each URL is echoed to
        the response and deferred to ``download_page`` on the ``downloader``
        queue. If no URLs or no known callback can be resolved, a 500 error
        with an explanatory message is returned instead.
        """
        self.response.headers['content-type'] = 'text/plain'

        if self.request.get('dbg'):
            self.debug()
            return

        source = self.request.get('source')
        name = self.request.get('name')

        # Maps the callback names used in the cron configuration to the
        # actual parser functions.
        callbacks = {
            'mph_rss': parse_mph_rss,
            'bookxcess_pdf': parse_bookxcess_pdf
        }

        tasks = dict(cron.tasks)
        urls = None
        call = None

        if source in tasks:
            task = tasks[source]
            named_urls = dict(task['urls'])
            if name in named_urls:
                urls = named_urls[name]
                call = callbacks.get(task['callback'])

        if not (urls and call):
            self.error(500)
            self.response.out.write('No URLs or callback found: %s, %s' % (urls, call))
            return

        for url in urls:
            lowered_path = urlsplit(url).path.lower()
            defer_kwargs = dict(
                _queue='downloader',
                _name='download-%s' % Document.hash_url(url),
                callback=call,
                args={"filename": basename(lowered_path)}
            )
            self.response.out.write("%s\n" % url)
            try:
                deferred.defer(download_page, url, **defer_kwargs)
            except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
                # A task with this name already exists or was used before;
                # the URL is simply not queued again.
                pass