def has_urls(data, url=None):
    if data:
        try:
            spider.findall(data, url).next()
            return True
        except StopIteration:
            pass
def process_records(queue, rule, wb):
    newqueue = []
    for record in queue:
        maybesave(wb, queue)
        url = record.get("url")
        try:
            (fp, filename) = io.get_tempfile()
            f = fetch.Fetcher(mode=record.get("mode"), url=url, filename=filename)
            url = get_url(f, wb, host_filter=rule.get("host_filter"))
            filename = f.filename

            # consider retrying the fetch if it failed
            if f.error and fetch.err.is_temporal(f.error):
                if not record.get("retry"):
                    record["retry"] = True
                    queue.append(record)

            if record.get("mode") == fetch.Fetcher.SPIDER:
                # spider the fetched page for further urls to enqueue
                data = open(filename, 'r').read()
                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                urls = urlrewrite.rewrite_urls(url, urls)
                (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)

            if record.get("mode") == fetch.Fetcher.FETCH:
                # store the fetched file under a name derived from its url
                shutil.move(filename,
                            io.safe_filename(urlrewrite.url_to_filename(url)))

        except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost):
            pass
        except KeyboardInterrupt:
            # save the unprocessed remainder of the queue before exiting
            q = queue[queue.index(record):]
            q.extend(newqueue)
            save_session(wb, queue=q)
            sys.exit(1)
        except Exception, exc:
            log_exc(exc, url, wb)
        finally: