import re


def crawl_sitemap(url):
    # Download the sitemap file (download() is assumed to be available,
    # e.g. imported from the scraper module as in the later examples)
    sitemap = download(url)

    # Extract the sitemap links from the <loc> tags
    links = re.findall("<loc>(.*?)</loc>", sitemap)

    # Download each link
    for link in links:
        html = download(link)
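A minimal usage sketch for the function above. The sitemap URL is illustrative: the example.webscraping.com domain appears in Example #5 below, but the /sitemap.xml path is an assumption, not taken from the examples.

# Hypothetical sitemap URL - adjust to the site being crawled
crawl_sitemap('http://example.webscraping.com/sitemap.xml')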
Example #2
def main():
    args = sys.argv[1:]

    # Build the search criteria from the command-line arguments
    crit = input.criteria()
    input.run(args, crit)

    # A second criteria object stores the department code (not used further
    # in this excerpt)
    storage = input.criteria()
    storage.data["department"] = "asd-CHEM"

    with requests.Session() as s:
        # Log in, save the search page, submit the search, and parse the
        # results to a file
        scraper.login(s)
        scraper.download(s.get(GOLD_SEARCH_URL), DEFAULT_GOLD_FILE_PATH,
                         "search")
        scraper.post_search(crit, s, "chem3")
        html_extraction.parse_to_file("chem3", pretty=True)
Example #3
import re
import urlparse


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex."""
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)

        # Filter for links that match the regex
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)
                crawl_queue.append(link)
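A minimal usage sketch for the crawler above, assuming download() and get_links() are available as in the excerpt. The seed URL and the link pattern are illustrative:

# Follow only links whose path matches /index or /view (illustrative pattern)
link_crawler('http://example.webscraping.com', '/(index|view)')

Note that this version keeps no record of URLs it has already visited, so pages that link back to each other will be re-queued; a production crawler would track seen links before appending to crawl_queue.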
Example #4
def main(argv):
    """Run scraper.py in an infinite loop."""
    args = parse_cmdline(argv[1:])
    rsync_url, status, destination, storage_service = scraper.init(args)
    prometheus_client.start_http_server(args.metrics_port)
    # First, clear out any existing cache that can be cleared.
    with UPLOAD_RUNS.time():
        # Upload except for the most recent day on disk.
        retry.api.retry_call(scraper.upload_stale_disk,
                             (args, status, destination, storage_service),
                             exceptions=scraper.RecoverableScraperException)
    # Now, download then upload until we run out of num_runs
    while args.num_runs > 0:
        try:
            logging.info('Scraping %s', rsync_url)
            with RSYNC_RUNS.time():
                scraper.download(args, rsync_url, status, destination)
            with UPLOAD_RUNS.time():
                scraper.upload_if_allowed(args, status, destination,
                                          storage_service)
            SCRAPER_SUCCESS.labels(message='success').inc()
        except scraper.RecoverableScraperException as error:
            logging.error('Scrape and upload failed: %s', error.message)
            SCRAPER_SUCCESS.labels(message=str(error.prometheus_label)).inc()
        # In order to prevent a thundering herd of rsync jobs, we spread the
        # jobs around in a memoryless way.  By choosing our inter-job sleep
        # time from an exponential distribution (the one and only memoryless
        # continuous distribution), we ensure that the resulting stream of
        # jobs forms a Poisson process.  The denominator of the fraction in
        # the code below is the mean sleep time in seconds.
        #
        # That said, don't sleep for more than an hour.
        sleep_time = min(random.expovariate(1.0 / args.expected_wait_time),
                         3600)
        logging.info('Sleeping for %g seconds', sleep_time)
        with SLEEPS.time():
            time.sleep(sleep_time)
        args.num_runs -= 1
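The sleep logic above is the reusable piece: drawing the wait from an exponential distribution keeps the job stream memoryless, while min() caps the wait at an hour. A minimal sketch of the same pattern in isolation; the names and the 30-second mean are illustrative, not part of the scraper module:

import random
import time

MEAN_WAIT_SECONDS = 30.0   # illustrative mean of the exponential draw
MAX_WAIT_SECONDS = 3600    # never sleep for more than an hour


def memoryless_sleep():
    # expovariate() takes the rate (1 / mean), so draws average MEAN_WAIT_SECONDS
    wait = min(random.expovariate(1.0 / MEAN_WAIT_SECONDS), MAX_WAIT_SECONDS)
    time.sleep(wait)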
Example #5
from scraper import download
import itertools

# Maximum number of consecutive download errors to tolerate
maximum_errors = 5

# Current number of consecutive download errors
number_of_errors = 0

for page in itertools.count(1):
    url = "http://example.webscraping.com/view/-%d" % page
    html = download(url)

    if html is None:
        # Error when trying to download the webpage
        number_of_errors += 1
        if number_of_errors == maximum_errors:
            # Too many consecutive failures - give up
            break
    else:
        # Success - scrape the result here and reset the error count
        number_of_errors = 0