import requests

from ricecooker.utils.caching import CacheControlAdapter, CacheForeverHeuristic, FileCache


def create_http_session(hostname):
    sess = requests.Session()
    cache = FileCache('.webcache')
    basic_adapter = CacheControlAdapter(cache=cache)
    forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
    sess.mount('http://', basic_adapter)
    sess.mount('https://', basic_adapter)
    sess.mount('http://www.' + hostname, forever_adapter)
    sess.mount('https://www.' + hostname, forever_adapter)
    return sess
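
# A minimal usage sketch ('source.org' is a hypothetical hostname): responses from
# the www. subdomain are cached forever in '.webcache', while requests to any other
# host go through the basic adapter and respect the server's cache headers.
# session = create_http_session('source.org')
# response = session.get('https://www.source.org/lessons/index.html')
# html = response.text
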
import requests
import time

from selenium import webdriver
from requests_file import FileAdapter
from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter

DOWNLOAD_SESSION = requests.Session()  # Session for downloading content from urls
DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
DOWNLOAD_SESSION.mount('file://', FileAdapter())

cache = FileCache('.webcache')
forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
DOWNLOAD_SESSION.mount('http://', forever_adapter)
DOWNLOAD_SESSION.mount('https://', forever_adapter)


def read(path, loadjs=False, session=None, driver=None):
    """
    read: Reads from source and returns contents
    Args:
        path: (str) url or local path to download
        loadjs: (boolean) indicates whether to load js (optional)
        session: (requests.Session) session to use to download (optional)
        driver: (selenium.webdriver) webdriver to use to download (optional)
    Returns: str content from file or page
    """
    session = session or DOWNLOAD_SESSION
    try:
        if loadjs:  # Wait until js loads then return contents
            driver = driver or webdriver.PhantomJS()
            driver.get(path)
            time.sleep(5)
            return driver.page_source
        else:       # Read page contents directly from the url
            response = session.get(path)
            response.raise_for_status()
            return response.content
    except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema):
        # `path` is a local path rather than a url, so read the file from disk
        with open(path, 'rb') as fobj:
            return fobj.read()
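
# A minimal usage sketch (the URL is hypothetical): the first call downloads the page
# and stores it in '.webcache'; subsequent calls for the same URL are served from the
# cache. Pass loadjs=True to render the page in a headless browser before returning it.
# html = read('https://example.org/lessons/intro.html')
# rendered_html = read('https://example.org/lessons/intro.html', loadjs=True)
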
import requests

from urllib.parse import urlparse, urljoin
import uuid
from bs4 import BeautifulSoup
from selenium import webdriver
import selenium.webdriver.support.ui as selenium_ui
from requests_file import FileAdapter
from ricecooker.config import LOGGER, PHANTOMJS_PATH, STRICT
from ricecooker.utils.html import download_file
from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter

DOWNLOAD_SESSION = requests.Session()  # Session for downloading content from urls
DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
DOWNLOAD_SESSION.mount('file://', FileAdapter())

# use_dir_lock works with all filesystems and OSes
cache = FileCache('.webcache', use_dir_lock=True)
forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
DOWNLOAD_SESSION.mount('http://', forever_adapter)
DOWNLOAD_SESSION.mount('https://', forever_adapter)

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

USE_PYPPETEER = False
try:
    # pyppeteer (headless Chromium) is an optional dependency used to load js-heavy pages
    import pyppeteer  # noqa: F401
    USE_PYPPETEER = True
except ImportError:
    pass
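
# A minimal usage sketch (the URL is hypothetical): requests made through
# DOWNLOAD_SESSION are retried on connection errors, cached forever in '.webcache',
# and can send the browser-like DEFAULT_HEADERS defined above.
# response = DOWNLOAD_SESSION.get('https://example.org/page.html', headers=DEFAULT_HEADERS)
# doc = BeautifulSoup(response.content, 'html.parser')
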
import time

import requests

from ricecooker.utils.caching import (
    CacheControlAdapter,
    CacheForeverHeuristic,
    FileCache,
    InvalidatingCacheControlAdapter,
)

sess = requests.Session()
cache = FileCache(".webcache")
forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
invalidate_adapter = InvalidatingCacheControlAdapter(cache=cache)
sess.mount("http://www.khanacademy.org/api/v2/topics/topictree", forever_adapter)
sess.mount("http://www.khanacademy.org/api/v1/assessment_items/", forever_adapter)
sess.mount("https://api.crowdin.com", forever_adapter)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}


class Dummy404ResponseObject(requests.Response):
    """Stand-in response object that reports a 404 status for a given url."""

    def __init__(self, url):
        super(Dummy404ResponseObject, self).__init__()
        self._content = b""
        self.status_code = 404
        self.url = url
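
# A minimal usage sketch: only URLs under the mounted prefixes above are cached
# forever; any other URL fetched through `sess` follows normal HTTP caching rules.
# The assumption that the topictree endpoint returns JSON is illustrative only.
# topic_tree = sess.get(
#     "http://www.khanacademy.org/api/v2/topics/topictree",
#     headers=headers,
# ).json()
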
import json
import os
import queue
import re
import time
from collections import Counter, defaultdict
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from ricecooker.config import LOGGER
from ricecooker.utils.caching import CacheControlAdapter, CacheForeverHeuristic, FileCache


class BasicCrawler(object):
    """
    Basic web crawler that uses breadth-first search to visit all pages of a
    website, starting from the `MAIN_SOURCE_DOMAIN` and browsing pages recursively.
    Every page visited is aware of the `parent` (referring page), which makes it
    possible to construct a web resource tree that can later be used to construct
    a ricecooker json tree, and ultimately a Kolibri channel.
    """
    BASE_IGNORE_URLS = [
        'javascript:void(0)',
        '#',
        re.compile('^mailto:.*'),
        re.compile('^javascript:.*'),
    ]
    MEDIA_FILE_FORMATS = ['pdf', 'zip', 'rar', 'mp4', 'mp3', 'm4a', 'ogg']
    MEDIA_CONTENT_TYPES = [
        'application/pdf',
        'application/zip',
        'application/x-zip-compressed',
        'application/octet-stream',
        'video/mpeg',
        'video/mp4',
        'audio/vorbis',
        'audio/mp3',
        'audio/mpeg',
        'image/png',
        'image/jpeg',
        'image/gif',
        'application/msword',
        'application/vnd.ms-excel',
        'application/vnd.ms-powerpoint',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ]
    GLOBAL_NAV_THRESHOLD = 0.7
    CRAWLING_STAGE_OUTPUT = 'chefdata/trees/web_resource_tree.json'

    # Subclass attributes
    MAIN_SOURCE_DOMAIN = None   # should be defined by subclass
    SOURCE_DOMAINS = []         # should be defined by subclass
    START_PAGE = None           # should be defined by subclass
    START_PAGE_CONTEXT = {}     # should be defined by subclass
    IGNORE_URLS = []            # should be defined by subclass

    rules = []           # contains tuples (url_RE_pattern, handler_function)
    kind_handlers = {}   # mapping from web resource kinds (user defined) to handlers
                         # e.g. {'LessonWebResource': self.on_lesson, ...}

    # CACHE LOGIC
    SESSION = requests.Session()
    CACHE = FileCache('.webcache')

    # queue used to keep track of which pages we should crawl next
    queue = None  # instance of queue.Queue created inside the `crawl` method

    # keep track of how many times a given URL is seen during the crawl:
    # the first time a URL is seen it will be automatically followed, but
    # subsequent occurrences will record link existence but not recurse
    global_urls_seen_count = defaultdict(int)  # DB of all urls that have ever been seen
                                               # { 'http://site.../fullpath?a=b#c': 3, ... }
    urls_visited = {}  # 'http://site.../fullpath?a=b#c' --> cached version of html content

    def __init__(self, main_source_domain=None, start_page=None):
        if main_source_domain is None and start_page is None:
            raise ValueError('Need to specify main_source_domain or start_page.')
        if main_source_domain:
            self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
            self.START_PAGE = self.MAIN_SOURCE_DOMAIN
        if self.MAIN_SOURCE_DOMAIN is None:
            self.MAIN_SOURCE_DOMAIN = urlparse(start_page).netloc
        if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
            self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)
        if start_page:
            self.START_PAGE = start_page

        # keep track of broken links
        self.broken_links = []

        forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=self.CACHE)
        for source_domain in self.SOURCE_DOMAINS:
            self.SESSION.mount(source_domain, forever_adapter)  # TODO: change to less aggressive in final version

    # GENERIC URL HELPERS
    ############################################################################

    def cleanup_url(self, url):
        """
        Removes the URL fragment that falsely makes URLs look different.
        Subclasses can overload this method to perform other URL-normalizations.
        """
        url = urldefrag(url)[0]
        return url

    def url_to_path(self, url):
        """
        Remove any of the SOURCE_DOMAINS from url if it starts with one of them.
""" for source_domain in self.SOURCE_DOMAINS: if url.startswith(source_domain): path = url.replace(source_domain, '') return path return url def should_ignore_url(self, url): """ Returns True if `url` matches any of the IGNORE_URL criteria. """ url = self.cleanup_url(url) # 1. run through ignore lists combined_ignore_patterns = self.BASE_IGNORE_URLS.copy() combined_ignore_patterns.extend(self.IGNORE_URLS) for pattern in combined_ignore_patterns: if isinstance(pattern, str): if url == pattern: return True elif isinstance(pattern, re._pattern_type): if pattern.match(url): return True elif callable(pattern): if pattern(url): return True else: raise ValueError( 'Unrecognized pattern in IGNORE_URLS. Use strings, REs, or callables.' ) # 2. check if url is on one of the specified source domains found = False for source_domain in self.SOURCE_DOMAINS: if url.startswith(source_domain): found = True return not found # should ignore if not found in SOURCE_DOMAINS list def is_media_file(self, url): """ Makes a HEAD request for `url` and reuturns (vertict, head_response), where verdict is True if `url` points to a media file (.pdf, .docx, etc.) """ head_response = self.make_request(url, method='HEAD') if head_response: content_type = head_response.headers.get('content-type', None) if not content_type: LOGGER.warning( 'HEAD response does not have `content-type` header. url = ' + url) return (False, None) if content_type in self.MEDIA_CONTENT_TYPES: return (True, head_response) else: return (False, head_response) else: LOGGER.warning('HEAD request failed for url ' + url) # Fallback strategy: try to guess if media link based on extension for media_ext in self.MEDIA_FILE_FORMATS: if url.endswith('.' + media_ext): return (True, None) # if all else fails, assume False return (False, None) # CRAWLING TASK QUEUE API ############################################################################ # # queue tasks are tuples (url, context) where # - url (str): which page should be visited # - context (dict): generic container for data associated with url, notably # - `context['parent']` is the web resources dict of the referring page # - `context['kind']` can be used to assign a custom handler, e.g., on_course def queue_empty(self): return self.queue.empty() def get_url_and_context(self): return self.queue.get() def enqueue_url_and_context(self, url, context, force=False): # TODO(ivan): clarify crawl-only-once logic and use of force flag in docs url = self.cleanup_url(url) if url not in self.global_urls_seen_count.keys() or force: # LOGGER.debug('adding to queue: url=' + url) self.queue.put((url, context)) else: pass # LOGGER.debug('Not going to crawl url ' + url + 'beacause previously seen.') self.global_urls_seen_count[url] += 1 # BASIC PAGE HANDLER ############################################################################ def on_page(self, url, page, context): """ Basic handler that appends current page to parent's children list and adds all links on current page to the crawling queue. 
""" LOGGER.debug('in on_page ' + url) page_dict = dict( kind='PageWebResource', url=url, children=[], ) page_dict.update(context) # attach this page as another child in parent page context['parent']['children'].append(page_dict) links = page.find_all('a') for i, link in enumerate(links): if link.has_attr('href'): link_url = urljoin(url, link['href']) if self.should_ignore_url(link_url): pass # Uncomment three lines below for debugging to record ignored links # ignored_rsrc_dict = self.create_ignored_url_dict(link_url) # ignored_rsrc_dict['parent'] = page_dict # page_dict['children'].append(page_dict) else: self.enqueue_url_and_context(link_url, {'parent': page_dict}) else: pass # LOGGER.debug('a with no nohref found ' + str(link)) # MAIN LOOP ############################################################################ def crawl(self, limit=1000, save_web_resource_tree=True, devmode=True): # initialize or reset crawler state self.queue = queue.Queue() self.global_urls_seen_count = defaultdict(int) self.urls_visited = {} # add the start page to the crawling queue channel_dict = dict( url='This is a temp. outer container for the crawler channel tree.' 'Its unique child node is the web root.', kind='WEB_RESOURCE_TREE_CONTAINER', children=[], ) start_url = self.START_PAGE root_context = {'parent': channel_dict} if self.START_PAGE_CONTEXT: root_context.update(self.START_PAGE_CONTEXT) self.enqueue_url_and_context(start_url, root_context) counter = 0 while not self.queue_empty(): # 1. GET next url to crawl an its context dict original_url, context = self.get_url_and_context() # 2. Media files (PDF/ZIP/MP3) and broken link check verdict, head_response = self.is_media_file(original_url) if verdict == True: media_rsrc_dict = self.create_media_url_dict( original_url, head_response) media_rsrc_dict['parent'] = context['parent'] context['parent']['children'].append(media_rsrc_dict) continue # 3. Let's go GET that url url, page = self.download_page(original_url) if page is None: LOGGER.warning('GET ' + original_url + ' did not return page.') broken_link_dict = self.create_broken_link_url_dict( original_url) broken_link_dict['parent'] = context['parent'] context['parent']['children'].append(broken_link_dict) continue # cache BeatifulSoup parsed html in memory (because RAM is cheap!) self.urls_visited[original_url] = page # annotate context to keep track of URL befor redirects if url != original_url: context['original_url'] = original_url ########## HANDLER DISPATCH LOGIC ################################ handled = False # A. kind-handler based dispatch logic if 'kind' in context: kind = context['kind'] if kind in self.kind_handlers: handler = self.kind_handlers[kind] if callable(handler): handler(url, page, context) handled = True elif isinstance(handler, str) and hasattr(self, handler): handler_fn = getattr(self, handler) handler_fn(url, page, context) handled = True else: raise ValueError( 'Unrecognized handler type', handler, 'Should be method or name of method.') else: LOGGER.info('No handler registered for kind ' + str(kind) + ' so falling back to on_page handler.') # B. 
            path = url.replace(self.MAIN_SOURCE_DOMAIN, '')  # TODO: redo with urls instead of paths <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
            for pat, handler_fn in self.rules:
                if pat.match(path):
                    handler_fn(url, page, context['parent'])
                    handled = True

            # if none of the above caught it, we use the default on_page handler
            if not handled:
                self.on_page(url, page, context)
            ####################################################################

            # limit crawling to 1000 pages unless otherwise told (failsafe default)
            counter += 1
            if limit and counter > limit:
                break

        # remove parent links before outputting the tree
        self.cleanup_web_resource_tree(channel_dict)

        # hoist the entire tree one level up to get rid of the temp. outer container
        channel_dict = channel_dict['children'][0]

        # Save output
        if save_web_resource_tree:
            self.write_web_resource_tree_json(channel_dict)

        # Display debug info
        if devmode:
            self.print_crawler_devmode(channel_dict)

        return channel_dict

    def download_page(self, url, *args, **kwargs):
        """
        Download `url` (following redirects) and soupify response contents.
        Returns (final_url, page) where final_url is the URL after following redirects.
        """
        response = self.make_request(url, *args, **kwargs)
        if not response:
            return (None, None)
        html = response.text
        page = BeautifulSoup(html, "html.parser")
        LOGGER.debug('Downloaded page ' + str(url) + ' title:' + self.get_title(page))
        return (response.url, page)

    def make_request(self, url, timeout=60, *args, method='GET', **kwargs):
        """
        Failure-resistant HTTP GET/HEAD request helper method.
        """
        retry_count = 0
        max_retries = 5
        while True:
            try:
                response = self.SESSION.request(method, url, *args, timeout=timeout, **kwargs)
                break
            except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
                retry_count += 1
                LOGGER.warning("Connection error ('{msg}'); about to perform retry {count} of {trymax}."
                               .format(msg=str(e), count=retry_count, trymax=max_retries))
                time.sleep(retry_count * 1)
                if retry_count >= max_retries:
                    LOGGER.error("FAILED TO RETRIEVE:" + str(url))
                    return None
        if response.status_code != 200:
            LOGGER.error("ERROR " + str(response.status_code) + ' when getting url=' + url)
            return None
        return response

    # DEFAULT ACTIONS FOR MEDIA FILES AND BROKEN LINKS
    ############################################################################

    def create_media_url_dict(self, original_url, head_response):
        """
        Create metadata dict for media URL `original_url` using `head_response`.
        """
        original_url_clean = self.cleanup_url(original_url)  # before redirects
        media_rsrc_dict = dict(
            kind='MediaWebResource',
            url=original_url_clean,
            children=[],
        )
        if head_response:
            url = self.cleanup_url(head_response.url)  # URL after possible redirect
            media_rsrc_dict['url'] = url
            if url != original_url:
                media_rsrc_dict['original_url'] = original_url
            #
            content_type = head_response.headers.get('content-type', None)
            if content_type:
                media_rsrc_dict['content-type'] = content_type
                # TODO(ivan): resolve content-type to content type label using le-utils lookup
            #
            content_disposition = head_response.headers.get('content-disposition', None)
            if content_disposition:
                media_rsrc_dict['content-disposition'] = content_disposition
            #
            content_length = head_response.headers.get('content-length', None)
            if content_length:
                media_rsrc_dict['content-length'] = content_length
            #
        return media_rsrc_dict

    def create_broken_link_url_dict(self, url):
        """
        Create a metadata dict for the broken link `url`.
""" broken_link_dict = dict( kind='BrokenLink', url=url, children=[], ) self.broken_links.append(url) return broken_link_dict def create_ignored_url_dict(self, url): """ Create metadata link for a URL that matches one of self.IGNORE_URLS. """ ignored_url_dict = dict( kind='IgnoredUrl', url=url, children=[], ) return ignored_url_dict # WEB RESOURCE INFO UTILS (CRAWLER DEVMODE) ############################################################################ def print_crawler_devmode(self, channel_tree): """ Craweler devmode info useful during interactive development of the cralwer. """ print('\n\n\n') print('#' * 80) print('# CRAWLER RECOMMENDATIONS BASED ON URLS ENCOUNTERED:') print('#' * 80) print( '\n1. These URLs are very common and look like global navigation links:' ) global_nav_candidates = self.infer_gloabal_nav(channel_tree) for c in global_nav_candidates['children']: print(' - ', c['url']) print( '\n2. These are common path fragments found in URLs paths, so could correspond to site struture:' ) fragments_tuples = self.infer_tree_structure(channel_tree) for fpath, fcount in fragments_tuples: print(' - ', str(fcount), 'urls on site start with ', '/' + fpath) if len(self.broken_links) > 0: print( '\n3. These are broken links --- you might want to add them to IGNORE_URLS' ) print(self.broken_links) print('\n') print('#' * 80) print('\n\n') def infer_tree_structure(self, tree_root, show_top=10): """ Walk web resource tree and look for patterns in urls. Print the top 10 occurence of subpaths that are common to multiple URLs. E.g. if we see a lot of URLs like /pat/smth1 /pat/smth2 /pat/smth3, we'll identify `/pat` as a candidate for site structure: Returns ['/pat', ...] """ # Get URLs unique_urls = set() def recusive_visit_extract_urls(subtree): url = subtree['url'] if url not in unique_urls: unique_urls.add(url) for child in subtree['children']: recusive_visit_extract_urls(child) recusive_visit_extract_urls(tree_root) # Build path trie subpath_trie = {} def _add_parts_here(path_parts, here): if not path_parts: return else: part = path_parts.pop(0) if part not in here.keys(): here[part] = {} _add_parts_here(path_parts, here[part]) else: _add_parts_here(path_parts, here[part]) for url in unique_urls: path = self.url_to_path(url) path = path.split('?')[0] # rm query string path_parts = path.split('/')[1:] _add_parts_here(path_parts, subpath_trie) # annotate with counts def _recusive_count_children(here): if not here.keys(): return 1 count = 0 for subpath in here.keys(): count += _recusive_count_children(here[subpath]) return count path_count_tuples = [] for path, subtrie in subpath_trie.items(): count = _recusive_count_children(subtrie) path_count_tuples.append((path, count)) # top 10 sorted by count sorted_path_count_tuples = sorted(path_count_tuples, key=lambda t: t[1], reverse=True) return sorted_path_count_tuples[0:show_top] def compute_subtree_stats(self, subtree, counter=None): """ Recusively compute counts of different `kind` web sesources in subtree. """ if counter is None: counter = Counter() # don't count subtree itself, only its children else: counter[subtree['kind']] += 1 if 'children' in subtree: for child in subtree['children']: self.compute_subtree_stats(child, counter=counter) return counter def print_tree(self, tree_root, print_depth=3, hide_keys=[]): """ Print contents of web resource tree starting at `tree_root`. 
""" def print_web_resource_node(node, depth=1): INDENT_BY = 3 extra_attrs = '' if node is None: print('Encountered a None node in print_web_resource_node') return if 'kind' in node: extra_attrs = ' (' + node['kind'] + ') ' path = self.url_to_path( node['url']) # print paths instead of full URLs print(' ' * INDENT_BY * depth + ' -', 'path:', path, extra_attrs) if depth < print_depth: # recurse and print children if node['children']: print(' ' * INDENT_BY * depth + ' ', 'children:') for child in node['children']: print_web_resource_node(child, depth=depth + 1) else: # print only summary counts counts = self.compute_subtree_stats(node) if counts: counts_str = str(counts).replace('Counter', '').strip('()') print(' ' * INDENT_BY * depth + ' ', 'children counts:', counts_str) print_web_resource_node(tree_root) def infer_gloabal_nav(self, tree_root, debug=False): """ Returns a list of web resources that are likely to be global nav links. """ global_nav_nodes = dict( url=self.MAIN_SOURCE_DOMAIN, kind='GlobalNavLinks', children=[], ) # 1. infer global nav URLs based on total seen count / total pages visited total_urls_seen_count = len(self.urls_visited.keys()) def _is_likely_global_nav(url): """ Returns True if `url` is likely a global nav link based on how often seen in pages. """ seen_count = self.global_urls_seen_count[url] if debug: LOGGER.debug('seen_count/total_urls_seen_count=' + str(float(seen_count) / total_urls_seen_count) + '=' + str(seen_count) + '/' + str(total_urls_seen_count) + self.url_to_path(url)) # if previously determined to be a global nav link for global_nav_resource in global_nav_nodes['children']: if url == global_nav_resource['url']: return True # if new link that is seen a lot if float(seen_count ) / total_urls_seen_count > self.GLOBAL_NAV_THRESHOLD: return True return False def recusive_visit_find_global_nav_children(subtree): for child in subtree['children']: child_url = child['url'] if len(child['children']) == 0 and _is_likely_global_nav( child_url): LOGGER.debug('Found candidate for global nav url=' + str(child_url) + 'adding to global_nav_nodes') global_nav_resource = dict( kind='GlobalNavLink', url=child_url, ) global_nav_resource.update(child) global_nav_nodes['children'].append(global_nav_resource) # recurse recusive_visit_find_global_nav_children(child) recusive_visit_find_global_nav_children(tree_root) return global_nav_nodes def remove_global_nav(self, tree_root, global_nav_nodes): """ Walks web resource tree and removes all web resources whose URLs match nodes in global_nav_nodes['children']. This method is a helper for debugging. Your production crawler should use `self.IGNORE_URLS` to remove global nav links so won't crawl them at all. """ global_nav_urls = [d['url'] for d in global_nav_nodes['children']] def _recusive_visit_rm_global_nav_children(subtree): newchildren = [] for child in subtree['children']: child_url = child['url'] if len(child['children'] ) == 0 and child_url in global_nav_urls: LOGGER.info('Removing global nav url =' + child_url) else: clean_child = _recusive_visit_rm_global_nav_children(child) newchildren.append(clean_child) subtree['children'] = newchildren return subtree _recusive_visit_rm_global_nav_children(tree_root) def cleanup_web_resource_tree(self, tree_root): """ Remove nodes' parent links (otherwise tree is not json serializable). 
""" def cleanup_subtree(subtree): if 'parent' in subtree: del subtree['parent'] for child in subtree['children']: cleanup_subtree(child) cleanup_subtree(tree_root) return tree_root # TEXT HELPERS ############################################################################ def get_text(self, element): """ Extract text contents of `element`, normalizing newlines to spaces and stripping. """ if element is None: return '' else: return element.get_text().replace('\r', '').replace('\n', ' ').strip() def get_title(self, page): title = '' head_el = page.find('head') if head_el: title_el = head_el.find('title') if title_el: title = title_el.get_text().strip() return title # OUTPUT JSON ############################################################################ def write_web_resource_tree_json(self, channel_dict): destpath = self.CRAWLING_STAGE_OUTPUT parent_dir, _ = os.path.split(destpath) if not os.path.exists(parent_dir): os.makedirs(parent_dir, exist_ok=True) with open(destpath, 'w') as wrt_file: json.dump(channel_dict, wrt_file, indent=2)