import csv
import json
import re
import string
import threading
import time
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile

import lxml.html

# The helpers below come from the accompanying chapter code. The module
# names are assumptions -- adjust them to match the project layout.
from downloader import Downloader
from mongo_cache import MongoCache
from mongo_queue import MongoQueue
from link_crawler import normalize, same_domain, get_robots, get_links

SLEEP_TIME = 1  # seconds between thread-pool maintenance passes


def get_dynamic():
    """Download a JavaScript-driven page and print its #results element."""
    url = 'http://example.webscraping.com/dynamic'
    D = Downloader()
    content = D(url)
    #print(content.decode('utf-8'))
    tree = lxml.html.fromstring(content)
    print(tree.cssselect('#results')[0].text_content())


def search1():
    """Scrape country names letter by letter via the AJAX search API."""
    template_url = ('http://example.webscraping.com/ajax/'
                    'search.json?page={}&page_size=10&search_term={}')
    countries = set()
    download = Downloader(cache=MongoCache())
    for letter in string.ascii_lowercase:
        page = 0
        while True:
            url = template_url.format(page, letter)
            print('URL:', url)
            response = download(url)
            try:
                ajax = json.loads(response)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    with open('countries.txt', 'w') as fp:
        fp.write('\n'.join(sorted(countries)))


def search2():
    """Fetch every country in a single oversized AJAX request.

    Setting page_size=1000 with the match-anything search term '.'
    returns all records in one response.
    """
    D = Downloader()
    response = D('http://example.webscraping.com/places/default/'
                 'search?page=0&page_size=1000&search_term=.')
    print(response.decode('utf-8'))
    ajax = json.loads(response)
    with open('countries.csv', 'w', newline='') as fp:
        writer = csv.writer(fp)
        for record in ajax['records']:
            writer.writerow([record['country']])


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads, with a MongoDB-backed crawl queue."""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # fetch the next URL, if any
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no URLs to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit on Ctrl-C
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl this website in multiple threads.

    In-memory variant: uses a plain list and a seen set instead of the
    MongoDB-backed queue above (note that it shadows that definition when
    both are kept in the same module).
    """
    print(seed_url)
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether this link has already been crawled
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.append(link)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit on Ctrl-C
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # sleep briefly so the CPU can focus on the worker threads
        time.sleep(SLEEP_TIME)


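# A minimal usage sketch for the crawler above. This entry point is not
# part of the original file; the callback and seed URL are illustrative.
def run_threaded_crawler_example():
    def report(url, html):
        # toy callback: report what was downloaded, enqueue no extra links
        print('Downloaded {} ({} bytes)'.format(url, len(html or b'')))
        return []
    threaded_crawler('http://example.webscraping.com',
                     scrape_callback=report, max_threads=5)

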
def alexa():
    """Download the Alexa top 1 million list and return its URLs."""
    D = Downloader()
    zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    print(type(zipped_data))
    urls = []  # the top 1 million URLs will be stored in this list
    with ZipFile(BytesIO(zipped_data)) as zf:
        csv_filename = zf.namelist()[0]
        print(csv_filename)
        mess = zf.open(csv_filename)
        for website in mess.readlines():
            # each line looks like b'1,google.com\n'; keep just the domain
            domain = website.decode('utf-8').strip().split(',')[1]
            print(domain)
            urls.append('http://' + domain)
    return urls


def alexa2():
    """As alexa(), but parse the ranking file with csv.reader."""
    D = Downloader()
    zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    print(type(zipped_data))
    urls = []  # the top 1 million URLs will be stored in this list
    with ZipFile(BytesIO(zipped_data)) as zf:
        csv_filename = zf.namelist()[0]
        print(csv_filename)
        # ZipFile.open returns a binary stream; wrap it so csv.reader gets text
        mess = TextIOWrapper(zf.open(csv_filename, 'r'))
        for rank, website in csv.reader(mess):
            print(rank, website)
            urls.append('http://' + website)
    return urls


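# Usage sketch (hypothetical, not from the original file): seed downloads
# with the first few Alexa domains. Only a handful are taken to keep the
# example quick; Downloader handles throttling and caching as elsewhere.
def crawl_top_sites(limit=5):
    D = Downloader()
    for url in alexa2()[:limit]:
        D(url)  # download each homepage

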
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1,
                 max_urls=-1, user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None, ignore_robots=False):
    """Crawl from the given seed URL following links matched by link_regex."""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check that the URL passes the robots.txt restrictions
        if ignore_robots or rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check that the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)
            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


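# Usage sketch for link_crawler (not from the original file; the regex is
# a hypothetical pattern for the demo site's index and view pages).
def run_link_crawler_example():
    link_crawler('http://example.webscraping.com',
                 link_regex='/(index|view)', max_depth=2)

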
def direct_download_ajax():
    """Download the AJAX endpoint directly and decode its JSON."""
    D = Downloader()
    response = D('http://example.webscraping.com/ajax/')
    content = json.loads(response.decode('utf-8'))
    print(content)


def fail_search():
    """Demonstrate why scraping the search page directly fails.

    The results div is populated by JavaScript after the page loads, so
    the raw HTML contains no result links for cssselect to match.
    """
    D = Downloader()
    response = D('http://example.webscraping.com/search')
    tree = lxml.html.fromstring(response)
    # prints an empty list: the AJAX results are not in the static HTML
    print(tree.cssselect('div#results a'))
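

# The fix demonstrated elsewhere in this file: query the AJAX endpoint the
# search page itself calls, instead of scraping the static HTML. The URL
# pattern and JSON keys are taken from search1() above.
def working_search(term='a'):
    D = Downloader()
    response = D('http://example.webscraping.com/ajax/'
                 'search.json?page=0&page_size=10&search_term={}'.format(term))
    return [record['country'] for record in json.loads(response)['records']]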