def get_canonical_url(url, whitelist, expandlist, extract, timeout=REQ_TIMEOUT):
    '''Get the canonical (or open graph) URL, synchronously.

    Fetches `url` (if it passes the whitelist/expandlist checks) and
    delegates to process_page() to resolve the canonical URL.

    Args:
        url - URL to resolve (unicode, or utf8-encoded bytes)
        whitelist - list checked via check_whitelist() for both the input
            and the post-redirect URL
        expandlist - additional list that permits the initial download only
        extract - passed through to check_whitelist()
        timeout - per-request timeout, forwarded to get_web_page()

    Returns:
        On early failure, a dict with keys
        'url_original', 'url_retrieved', 'method', 'reason';
        otherwise whatever process_page() returns.
        method is one of 'original', 'redirect', or 'bad url'
        (the latter when the redirect target fails URL validation).
    '''
    method = 'original'
    ret_url = url
    # if it's not unicode, it must be utf8, otherwise fail
    url_new = url_or_error(url)
    if url_new is None:
        return {'url_original': url, 'url_retrieved': None,
                'method': method, 'reason': 'invalid url'}
    url = url_new
    # Only download URLs that are in the WHITELIST or in the EXPANDLIST
    if not (check_whitelist(url, method, extract, expandlist) or
            check_whitelist(url, method, extract, whitelist)):
        return {'url_original': url, 'url_retrieved': ret_url,
                'method': method, 'reason': 'not in lists'}
    #
    # fetch page
    #
    page, enc, final_url, err = get_web_page(url, timeout)
    # check final url from download attempt; final_url is None when the
    # download failed entirely (err carries the reason, unused here)
    if final_url is not None:
        # normalize to unicode before comparing against the (unicode) input
        if not isinstance(final_url, unicode):
            final_url = final_url.decode('utf8')
        if final_url != url:
            # we were redirected; re-validate the target URL
            ret_url = url_or_error(final_url)
            if ret_url is not None:
                method = 'redirect'
                logging.debug('got redirect')
            else:
                method = 'bad url'
        else:
            ret_url = final_url
    # the final URL must itself be whitelisted (expandlist is not enough)
    if (ret_url is None or
            not check_whitelist(ret_url, method, extract, whitelist)):
        return {'url_original': url, 'url_retrieved': ret_url,
                'method': method, 'reason': 'not in whitelist'}
    return process_page(page, enc, url, ret_url, method)
def processed_handler(data):
    '''Callback for the async download path.

    NOTE(review): references `canonical_handler`, `extract` and `whitelist`
    as free variables — presumably this is the closure body built by
    make_processed_handler(); confirm against that factory.

    Args:
        data - 5-tuple (url, page, enc, final_url, err) as produced by
            the async fetch handler

    Delivers a result dict (or the output of process_page()) to
    canonical_handler(); returns nothing.
    '''
    url, page, enc, final_url, err = data
    ret_url = None
    method = 'original'
    # check final url from download attempt; None means the fetch failed
    if final_url is None:
        result = {'url_original': url, 'url_retrieved': None,
                  'method': None, 'reason': 'unreachable'}
        canonical_handler(result)
        return
    if final_url != url:
        # NOTE(review): unlike the sync path, url_or_error() may return
        # None here, in which case check_whitelist() below receives None
        # — confirm that is handled downstream
        ret_url = url_or_error(final_url)
        method = 'redirect'
        msg = 'got redirect: {} -> {}'.format(url, final_url)
        logging.debug(msg)
    else:
        ret_url = url
    # the final URL must be whitelisted
    if not check_whitelist(ret_url, method, extract, whitelist):
        result = {'url_original': url, 'url_retrieved': ret_url,
                  'method': None, 'reason': 'not in whitelist'}
        canonical_handler(result)
        return
    result = process_page(page, enc, url, ret_url, method)
    canonical_handler(result)
def get_web_page(url, timeout):
    '''
    Fetches content at a given URL. Requests implementation.

    Args:
        url - unicode string
        timeout - per-request timeout in seconds

    Returns:
        (data, enc, final_url, None) on success, or
        (None, None, None, reason) on error, where reason is one of
        'url', 'timeout', 'download', an HTTP status code string, or
        'content-type' (the latter returned as (None, enc, final_url,
        'content-type')).
    '''
    url = url_or_error(url)
    if url is None:
        return (None, None, None, 'url')

    reason = 'unk'  # default error reason
    req = None      # so the finally-close is safe if requests.get() raises
    # Download and process
    try:
        req = requests.get(url, timeout=timeout, allow_redirects=True,
                           stream=True)
        req.raise_for_status()
        # Response URL after any redirects
        final_url = req.url
        # Encoding as reported (or guessed) by requests
        enc = req.encoding
        # Only HTML is supported downstream
        content_type = req.headers.get('content-type')
        if content_type and 'text/html' not in content_type:
            msg = 'content type not supported %s for %s' % (content_type, url)
            logging.debug(msg)
            return (None, enc, final_url, 'content-type')
        # read the full body (consumes the stream)
        data = req.content
        return (data, enc, final_url, None)
    except requests.exceptions.Timeout:
        msg = 'timedout: {}'.format(url)
        logging.debug(msg)
        reason = 'timeout'
    except requests.exceptions.HTTPError:
        # req is always bound here: raise_for_status() is the only raiser
        # of HTTPError in this try block
        msg = 'download failed: url=%s reason: %d' % (url, req.status_code)
        logging.debug(msg)
        reason = str(req.status_code)
    except Exception as ex:
        msg = 'download failed: url=%s with %s' % (url, repr(ex))
        logging.debug(msg)
        reason = 'download'
    finally:
        # stream=True holds the connection open; release it on every path
        # (close() after content has been read is a harmless no-op)
        if req is not None:
            req.close()
    return (None, None, None, reason)
def extract_canonical(unicode_content):
    """Extracts canonical URL or Open Graph URL from the content.

    Args:
        unicode_content - HTML document as a unicode string

    Returns:
        The validated URL (via url_or_error) from <link rel="canonical">,
        falling back to <meta property="og:url">, or None when neither
        is present or the document cannot be parsed.

    Raises:
        FeatureNotFound - when the html5lib parser is not installed.
    """
    try:
        soup = BeautifulSoup(unicode_content, "html5lib")
    except FeatureNotFound:
        # html5lib is a hard dependency of this parser choice; do not
        # mask a deployment problem as "no canonical URL"
        logging.exception("missing html5lib?")
        raise
    except Exception as ex:
        # unparseable content: treat as "no canonical URL"
        logging.exception(ex)
        return None
    # NOTE: an unreachable duplicate `except Exception: pass` clause was
    # removed here — had it ever run, `soup` would have been unbound and
    # the code below would raise NameError.

    # Try canonical <link> first
    try:
        url_can = soup.find("link", rel="canonical")
        if url_can:
            url_new = url_can.get("href")
            if url_new:
                return url_or_error(url_new)
    except Exception:
        pass

    # Fall back to Open Graph og:url
    try:
        url_can = soup.find("meta",
                            attrs={"property": "og:url", "content": True})
        if url_can:
            u = url_can["content"]
            if u:
                return url_or_error(u)
    except Exception:
        pass

    # logging.debug('no canonical url found')
    return None
def get_web_page_async(url, timeout, maxsize, maxclients, processed_handler):
    '''
    Fetches content at a given URL asynchronously (Tornado AsyncHTTPClient).

    Args:
        url - unicode string
        timeout - request timeout (currently unused here; presumably
            applied by make_request_handler — TODO confirm)
        maxsize - max_buffer_size for the HTTP client
        maxclients - max_clients for the HTTP client
        processed_handler - callback receiving a 5-tuple
            (url, page, enc, final_url, err)

    Returns nothing; the result is delivered via processed_handler.
    '''
    checked = url_or_error(url)
    if checked is None:
        # BUG FIX 1: pass a 5-tuple (url first) — the downstream handler
        # unpacks `url, page, enc, final_url, err`; the old 4-tuple would
        # raise ValueError on unpack.
        processed_handler((url, None, None, None, 'url'))
        # BUG FIX 2: previously fell through and fetched url=None anyway.
        return
    url = checked
    http_client = AsyncHTTPClient(max_clients=maxclients,
                                  max_buffer_size=maxsize)
    # Download and process via callback
    handle_request = make_request_handler(processed_handler)
    http_client.fetch(url, handle_request)
def get_canonical_url_async(url, whitelist, expandlist, extract, timeout,
                            maxsize, maxclients, canonical_handler):
    '''Get the canonical (or open graph) URL, asynchronously.

    Async counterpart of get_canonical_url(): performs the same
    validation and list checks, then hands the download off to
    get_web_page_async().

    Args:
        url - URL to resolve (unicode, or utf8-encoded bytes)
        whitelist - list checked for both the input and final URL
        expandlist - additional list permitting the initial download only
        extract - passed through to check_whitelist()
        timeout, maxsize, maxclients - forwarded to get_web_page_async()
        canonical_handler - callback receiving the result dict with keys
            'url_original', 'url_retrieved', 'method', 'reason'
            (or whatever process_page() produces on success)

    Returns nothing; results are delivered via canonical_handler.
    '''
    method = 'original'
    ret_url = url
    # if it's not unicode, it must be utf8, otherwise fail
    url_new = url_or_error(url)
    if url_new is None:
        result = {'url_original': url, 'url_retrieved': None,
                  'method': method, 'reason': 'invalid url'}
        canonical_handler(result)
        return
    url = url_new
    # Only download URLs that are in the WHITELIST or in the EXPANDLIST
    if not (check_whitelist(url, method, extract, expandlist) or
            check_whitelist(url, method, extract, whitelist)):
        result = {'url_original': url, 'url_retrieved': ret_url,
                  'method': method, 'reason': 'not in lists'}
        logging.debug('passing result 1')
        canonical_handler(result)
        return
    # fetch page; the handler closure carries whitelist/extract so the
    # final-URL check happens after the async download completes
    processed_handler = make_processed_handler(canonical_handler,
                                               whitelist, extract)
    get_web_page_async(url, timeout, maxsize, maxclients, processed_handler)