Example #1
def has_urls(data, url=None):
    """Return True if at least one URL can be extracted from data."""
    if data:
        try:
            # spider.findall() (project-local) yields matches lazily; asking
            # for the first one raises StopIteration when there are none.
            next(spider.findall(data, url))
            return True
        except StopIteration:
            pass
    return False
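For illustration, here is a self-contained sketch of the same probe-a-lazy-generator pattern, with a hypothetical regex-based findall_stub standing in for the project's spider.findall. The only assumption is that findall yields matches lazily, so asking for the first one with next() answers "is there at least one URL?" without extracting them all.

import re

URL_RE = re.compile(r"https?://[^\s\"'<>]+")

def findall_stub(data, url=None):
    # hypothetical stand-in for spider.findall: lazily yield URLs found in data
    for match in URL_RE.finditer(data):
        yield match.group(0)

def has_urls_stub(data, url=None):
    if data:
        try:
            next(findall_stub(data, url))
            return True
        except StopIteration:
            pass
    return False

print(has_urls_stub("see http://example.com for details"))  # True
print(has_urls_stub("no links here"))                        # False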
Example #2
import os
import shutil
import sys

# fetch, spider, urlrewrite and the project's io wrapper, along with the
# helpers maybesave, get_url, qualify_urls, save_session and log_exc, are
# project-local and assumed to be in scope here.

def process_records(queue, rule, wb):
    newqueue = []
    for record in queue:
        maybesave(wb, queue)

        url = record.get("url")
        try:
            (fp, filename) = io.get_tempfile()
            f = fetch.Fetcher(mode=record.get("mode"),
                              url=url,
                              filename=filename)
            url = get_url(f, wb, host_filter=rule.get("host_filter"))
            filename = f.filename

            # retry the fetch once if it failed with a temporary error;
            # appending to queue while iterating lets the loop pick the
            # record up again on a later pass
            if f.error and fetch.err.is_temporal(f.error):
                if not record.get("retry"):
                    record["retry"] = True
                    queue.append(record)

            if record.get("mode") == fetch.Fetcher.SPIDER:
                with open(filename, 'r') as fobj:
                    data = fobj.read()
                urls = spider.unbox_it_to_ss(spider.findall(data, url))
                urls = urlrewrite.rewrite_urls(url, urls)

                (newqueue, wb) = qualify_urls(url, urls, rule, newqueue, wb)

            if record.get("mode") == fetch.Fetcher.FETCH:
                shutil.move(filename,
                            io.safe_filename(urlrewrite.url_to_filename(url)))

        except (fetch.DuplicateUrlWarning, fetch.UrlRedirectsOffHost):
            pass
        except KeyboardInterrupt:
            # on interrupt, save the unprocessed tail of the queue plus any
            # newly discovered urls so the session can be resumed later
            q = queue[queue.index(record):]
            q.extend(newqueue)
            save_session(wb, queue=q)
            sys.exit(1)
        except Exception as exc:
            log_exc(exc, url, wb)
        finally:
            # the original snippet breaks off here; cleaning up the temporary
            # file from io.get_tempfile() is the assumed intent
            try:
                os.close(fp)
                os.unlink(filename)
            except OSError:
                pass
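For illustration, a self-contained sketch of the FETCH-mode lifecycle follows, using the stdlib tempfile module in place of the project's io.get_tempfile and a hypothetical write_body callback in place of fetch.Fetcher: download into a temporary file, move it into its final location, and always remove the temporary file if it is still around.

import os
import shutil
import tempfile

def fetch_to_file(write_body, final_path):
    # hypothetical sketch: write_body stands in for the actual download step
    fp, tmpname = tempfile.mkstemp()
    try:
        with os.fdopen(fp, "wb") as fobj:
            write_body(fobj)
        shutil.move(tmpname, final_path)
    finally:
        # the temp file is gone if the move succeeded; remove it otherwise
        if os.path.exists(tmpname):
            os.unlink(tmpname)

fetch_to_file(lambda fobj: fobj.write(b"hello"), "downloaded.txt")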