# NOTE: the imports below are an assumption based on the calls in this module
# (BeautifulSoup 3 and the Python 2 App Engine SDK); Document, download_page,
# parse_bookxcess_pdf, parse_mph_rss, BOOKXCESS and cron are assumed to be
# defined elsewhere in the project.
import logging
from os.path import basename, splitext
from urlparse import urlsplit

from BeautifulSoup import BeautifulSoup
from google.appengine.api import taskqueue
from google.appengine.ext import deferred


def parse_bookxcess_html(document, headers, filename=None):
    """Parses a Bookxcess book listings page and queues a download task
    for every linked .htm/.html/.pdf resource."""
    soup = BeautifulSoup(document.contents)
    links = soup.findAll(['a', 'area'], href=True)

    # Dispatch on file extension: HTML pages recurse into this parser,
    # PDFs go to the PDF parser.
    parsers = {
        '.htm': parse_bookxcess_html,
        '.html': parse_bookxcess_html,
        '.pdf': parse_bookxcess_pdf,
    }

    urls = {}
    for link in links:
        url = link['href'].strip()
        # Resolve relative links against the site root (also accept https).
        if not url.startswith(('http://', 'https://')):
            url = BOOKXCESS + url
        urlp = urlsplit(url)
        path = urlp.path.lower()
        args = {"filename": basename(path)}
        ext = splitext(path)[1]
        if ext in parsers:
            parser = parsers[ext]
            urls[url] = (parser, args)

    for url, (parser, args) in urls.items():
        # Deterministic task name per URL, so duplicates are rejected by
        # the task queue instead of being downloaded twice.
        task_name = 'download-%s' % Document.hash_url(url)
        logging.info('parse_bookxcess_html: downloading %s in task %s'
                     % (url, task_name))
        try:
            deferred.defer(download_page, url, callback=parser, args=args,
                           _name=task_name, _queue='downloader')
        except (taskqueue.TaskAlreadyExistsError,
                taskqueue.TombstonedTaskError):
            # Already queued, or ran recently enough to be tombstoned; skip.
            pass
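
# Illustrative sketch (an assumption, not from the original source): the
# defers above rely on Document.hash_url to turn a URL into a stable
# identifier for task-name deduplication. A standalone equivalent, assuming
# a hex digest is what is wanted, might look like this hypothetical helper:

import hashlib

def _hash_url_sketch(url):
    """Hypothetical stand-in for Document.hash_url: a stable hex digest of
    a URL, safe to embed in an App Engine task name."""
    return hashlib.sha1(url.encode('utf-8')).hexdigest()

# With a scheme like this, two defers for the same URL produce the same task
# name, and the second raises TaskAlreadyExistsError, which the caller above
# deliberately swallows.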
def get(self):
    """Handler entry point: queues download tasks for the URL group named
    by the 'source' and 'name' request parameters, or dumps debug output
    when 'dbg' is set."""
    self.response.headers['content-type'] = 'text/plain'
    if self.request.get('dbg'):
        self.debug()
    else:
        source = self.request.get('source')
        name = self.request.get('name')
        urls, call = None, None

        # cron.tasks stores the callback as a string key; resolve it to an
        # actual parser function here.
        callbacks = {
            'mph_rss': parse_mph_rss,
            'bookxcess_pdf': parse_bookxcess_pdf,
        }

        TASKS = dict(cron.tasks)
        if source in TASKS:
            urlset = dict(TASKS[source]['urls'])
            if name in urlset:
                urls = urlset[name]
                call = callbacks.get(TASKS[source]['callback'], None)

        if urls and call:
            for url in urls:
                urlp = urlsplit(url)
                path = urlp.path.lower()
                args = {
                    "_queue": 'downloader',
                    # Same URL-hash naming scheme as parse_bookxcess_html,
                    # so repeated requests do not enqueue duplicate work.
                    "_name": 'download-%s' % Document.hash_url(url),
                    "callback": call,
                    "args": {"filename": basename(path)},
                }
                self.response.out.write("%s\n" % url)
                try:
                    deferred.defer(download_page, url, **args)
                except (taskqueue.TaskAlreadyExistsError,
                        taskqueue.TombstonedTaskError):
                    pass
        else:
            self.error(500)
            self.response.out.write('No URLs or callback found: %s, %s'
                                    % (urls, call))
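
# Illustrative sketch (an assumption, not from the original source): get()
# above implies cron.tasks is a sequence of (source, config) pairs, since it
# calls dict(cron.tasks) and dict(TASKS[source]['urls']). Each config carries
# a 'callback' string key plus 'urls' as (name, url_list) pairs. A
# hypothetical cron.py of that shape, with placeholder URLs:

tasks = [
    ('mph', {
        'callback': 'mph_rss',
        'urls': [
            ('weekly', ['http://example.com/mph/new-arrivals.rss']),
        ],
    }),
    ('bookxcess', {
        'callback': 'bookxcess_pdf',
        'urls': [
            ('titles', ['http://example.com/bookxcess/titles.pdf']),
        ],
    }),
]

# Given a config like this, a request such as ?source=bookxcess&name=titles
# (the handler's mount path is not shown in this section) would enqueue one
# 'download-<hash>' task per URL on the 'downloader' queue.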