def proxy_address(self, flow: http.HTTPFlow) -> typing.Tuple[str, int]:
    """Route the request to pywb if the CDX index knows the URL, else to warcprox."""
    # Canonicalise the requested URL so it can be compared with CDX entries.
    canon_url = str(urlcanon.whatwg(urlcanon.parse_url(flow.request.url)))
    ctx.log.info("checking %s..." % canon_url)
    # Query the CDX service for this URL.
    response = self.s.get(
        'http://cdxserver:8080/fc',
        params={'url': canon_url, 'sort': 'reverse', 'limit': 10})
    # Scan the returned CDX lines; a canonical-URL match means playback is possible.
    for line in response.iter_lines(decode_unicode=True):
        fields = line.split(" ")
        # Compare canonicalised URLs (in case an intermediary e.g. adds a default :80 port)
        candidate = str(urlcanon.whatwg(urlcanon.parse_url(fields[2])))
        if canon_url == candidate:
            ctx.log.info("MATCH")
            ctx.log.info("PYWB")
            return ("pywb", 8080)
        ctx.log.info("NO MATCH '%s' '%s'" % (canon_url, candidate))
    # Nothing indexed for this URL: record it instead.
    ctx.log.info("WARCPROX")
    return ("warcprox", 8000)
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    """Create a new brozzler.Page for `url`, one hop beyond `parent_page`."""
    url_for_scoping = urlcanon.semantic(url)
    url_for_crawling = urlcanon.whatwg(url)
    # Remember the fragment (if any) before stripping it from the crawl URL.
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    parent_hop_path = str(parent_page.hop_path if parent_page.hop_path else "")
    fields = {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'hop_path': parent_hop_path + "L",
        'via_page_id': parent_page.id,
        'via_page_url': parent_page.url,
        'hops_off_surt': hops_off,
        'hashtags': [hashtag] if hashtag else [],
    }
    return brozzler.Page(self.rr, fields)
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        dict of {page_id: Page} of fresh `brozzler.Page` representing in
            scope links accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    pages = {}  # {page_id: Page, ...}
    blocked = set()
    out_of_scope = set()
    for url in (outlinks or []):
        scoping_url = urlcanon.semantic(url)
        crawling_url = urlcanon.whatwg(url)
        verdict = site.accept_reject_or_neither(
                scoping_url, parent_page=parent_page)
        hops_off = 0
        if verdict is None:
            # "neither": accept only while within the site's hops-off budget;
            # the link counts as one more hop off scope than its parent.
            verdict = parent_page.hops_off < site.scope.get(
                    'max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        if verdict is not True:
            out_of_scope.add(str(crawling_url))
        elif not brozzler.is_permitted_by_robots(site, str(crawling_url)):
            blocked.add(str(crawling_url))
        else:
            fresh = self._build_fresh_page(site, parent_page, url, hops_off)
            if fresh.id in pages:
                # same canonical URL seen twice in outlinks: merge
                self._merge_page(pages[fresh.id], fresh)
            else:
                pages[fresh.id] = fresh
    return pages, blocked, out_of_scope
def extract_outlinks(self, timeout=60):
    """Run extract-outlinks.js in the browser and return a frozenset of
    whatwg-canonicalized outlink URLs (empty on error or no links)."""
    self.logger.info('extracting outlinks')
    self.websock_thread.expect_result(self._command_id.peek())
    js = brozzler.jinja2_environment().get_template(
            'extract-outlinks.js').render()
    msg_id = self.send_to_chrome(
            method='Runtime.evaluate', params={'expression': js})
    self._wait_for(
            lambda: self.websock_thread.received_result(msg_id),
            timeout=timeout)
    message = self.websock_thread.pop_result(msg_id)
    # the result payload must have the shape {'result': {'result': {'value': ...}}}
    well_formed = ('result' in message and 'result' in message['result']
                   and 'value' in message['result']['result'])
    if not well_formed:
        self.logger.error(
                'problem extracting outlinks, result message: %s', message)
        return frozenset()
    value = message['result']['result']['value']
    if not value:
        # no links found
        return frozenset()
    out = []
    for link in value.split('\n'):
        try:
            out.append(str(urlcanon.whatwg(link)))
        except AddressValueError:
            self.logger.warning('skip invalid outlink: %s', link)
    return frozenset(out)
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        dict of {page_id: Page} of fresh `brozzler.Page` representing in
            scope links accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    pages = {} # {page_id: Page, ...}
    blocked = set()
    out_of_scope = set()
    for url in outlinks or []:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        # Scope decision: True = in scope, False = out, None = undecided.
        decision = site.accept_reject_or_neither(
                url_for_scoping, parent_page=parent_page)
        if decision is True:
            hops_off = 0
        elif decision is None:
            # Undecided links are accepted only while within the site's
            # max_hops_off budget; they count one more hop off scope.
            decision = parent_page.hops_off < site.scope.get(
                    'max_hops_off', 0)
            hops_off = parent_page.hops_off + 1
        if decision is True:
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                fresh_page = self._build_fresh_page(
                        site, parent_page, url, hops_off)
                if fresh_page.id in pages:
                    # same canonical URL appeared twice in outlinks: merge
                    self._merge_page(pages[fresh_page.id], fresh_page)
                else:
                    pages[fresh_page.id] = fresh_page
            else:
                # in scope but disallowed by robots.txt
                blocked.add(str(url_for_crawling))
        else:
            out_of_scope.add(str(url_for_crawling))
    return pages, blocked, out_of_scope
def _build_fresh_pages(self, site, parent_page, urls):
    '''
    Returns a dict of page_id => brozzler.Page.
    '''
    pages = {}
    for url in urls:
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        # Capture the fragment before stripping it from the crawl URL.
        hashtag = (url_for_crawling.hash_sign
                   + url_for_crawling.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url_for_crawling)
        # A link whose SURT does not start with the site's configured SURT
        # is one more hop off scope than its parent page.
        if not url_for_scoping.surt().startswith(
                site.scope['surt'].encode('utf-8')):
            hops_off_surt = parent_page.hops_off_surt + 1
        else:
            hops_off_surt = 0
        page = brozzler.Page(self.rr, {
            'url': str(url_for_crawling),
            'site_id': site.id,
            'job_id': site.job_id,
            'hops_from_seed': parent_page.hops_from_seed + 1,
            'via_page_id': parent_page.id,
            'hops_off_surt': hops_off_surt,
            'hashtags': []})
        if page.id in pages:
            # Duplicate canonical URL: accumulate priority on the page we
            # already have, then keep working with that instance.
            pages[page.id].priority += page.priority
            page = pages[page.id]
        else:
            pages[page.id] = page
        if hashtag:
            # de-duplicate hashtags across occurrences of the same page
            page.hashtags = list(set(page.hashtags + [hashtag]))
    return pages
def test_w3c_test_data(input, href, test):
    """Canonicalize `input` per WHATWG and compare each URL component
    against the expected values in the W3C url test data record `test`."""
    url = urlcanon.parse_url(input)
    urlcanon.whatwg(url)
    assert test['protocol'].encode('utf-8') == (url.scheme + url.colon_after_scheme)
    assert test['username'].encode('utf-8') == url.username
    assert test['password'].encode('utf-8') == url.password
    assert test['host'].encode('utf-8') == url.host_port
    assert test['hostname'].encode('utf-8') == url.host
    assert test['pathname'].encode('utf-8') == url.path
    assert test['search'].encode('utf-8') == (
            url.query and (url.question_mark + url.query) or b'')
    assert test['hash'].encode('utf-8') == (
            url.fragment and (url.hash_sign + url.fragment) or b'')
    # `unicode` does not exist on python 3 (NameError); `str` is the
    # equivalent text-serialization there.
    assert test['href'] == str(url)
def url_to_canon(self, url):
    """Return a canonicalized form of `url`.

    Applies WHATWG canonicalization, strips a trailing "index.html",
    drops fragments, appends a trailing slash to extension-less leaf
    segments, re-quotes spaces, and finally runs url_normalize.
    """
    parsed_url = urlcanon.parse_url(url)
    urlcanon.whatwg(parsed_url)
    parsed_url = str(parsed_url)
    # Strip a trailing "index.html" (any letter case). Slicing by length
    # instead of str.index() avoids a ValueError on e.g. "Index.HTML" and
    # never truncates at an earlier occurrence of "index.html" in the URL.
    if parsed_url.lower().endswith("index.html"):
        parsed_url = parsed_url[:-len("index.html")]
    last_segment = parsed_url.rsplit('/', 1)[1]
    # Drop any fragment.
    if '#' in last_segment:
        parsed_url = parsed_url[:parsed_url.index("#")]
    # Leaf segments without an extension get a trailing slash.
    if last_segment != '' and '.' not in last_segment and not last_segment.endswith('/'):
        parsed_url += '/'
    parsed_url = urllib.parse.unquote(parsed_url)
    # Re-quote everything after "scheme:" if it contains spaces (only when
    # exactly one colon is present, i.e. no port / extra colons).
    if parsed_url.count(':') == 1:
        scheme_part, rest = parsed_url.split(':')
        if ' ' in rest:
            parsed_url = scheme_part + ':' + urllib.parse.quote(rest)
    parsed_url = url_normalize.url_normalize(parsed_url)
    return parsed_url
def visit_hashtags(self, page_url, hashtags, outlinks):
    """Navigate the browser to each fragment-only variant of `page_url`."""
    remaining = set(hashtags or [])
    # Collect fragments from outlinks that point back at this very page.
    for outlink in outlinks:
        candidate = urlcanon.whatwg(outlink)
        fragment = (candidate.hash_sign + candidate.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(candidate)
        if fragment and str(candidate) == page_url:
            remaining.add(fragment)
    # could inject a script that listens for HashChangeEvent to figure
    # out which hashtags were visited already and skip those
    for fragment in remaining:
        # navigate_to_hashtag (nothing to wait for so no timeout?)
        self.logger.debug('navigating to hashtag %s', fragment)
        target = urlcanon.whatwg(page_url)
        target.hash_sign = b'#'
        # the collected fragment includes its leading '#'
        target.fragment = fragment[1:].encode('utf-8')
        self.send_to_chrome(method='Page.navigate', params={'url': str(target)})
        time.sleep(5)  # um.. wait for idleness or something?
def visit_hashtags(self, page_url, hashtags, outlinks):
    # Start from the hashtags we were told about explicitly.
    _hashtags = set(hashtags or [])
    # Also gather fragments from outlinks whose fragment-less form is
    # exactly this page's url.
    for outlink in outlinks:
        url = urlcanon.whatwg(outlink)
        hashtag = (url.hash_sign + url.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url)
        if hashtag and str(url) == page_url:
            _hashtags.add(hashtag)
    # could inject a script that listens for HashChangeEvent to figure
    # out which hashtags were visited already and skip those
    for hashtag in _hashtags:
        # navigate_to_hashtag (nothing to wait for so no timeout?)
        self.logger.debug('navigating to hashtag %s', hashtag)
        url = urlcanon.whatwg(page_url)
        url.hash_sign = b'#'
        # `hashtag` carries its leading '#', which is set separately above
        url.fragment = hashtag[1:].encode('utf-8')
        self.send_to_chrome(
                method='Page.navigate', params={'url': str(url)})
        time.sleep(5) # um.. wait for idleness or something?
def normalizeUrl(url, link_value):
    """Canonicalize `link_value`, or return None for excluded/non-page URLs."""
    # Skip urls whose path matches any configured exclusion.
    if any(exc in url.path for exc in PATH_EXCLUSIONS):
        return None
    # Skip urls with query strings or fragments, and non-page schemes.
    if url.query or url.fragment:
        return None
    if url.scheme == "mailto" or url.scheme == "tel" \
            or url.scheme == "data" or url.scheme == "javascript":
        return None
    cleaned = eliminateFromURL(link_value, EXTRAS)
    parsed = urlcanon.parse_url(cleaned)
    urlcanon.whatwg(parsed)
    # NOTE(review): this also collapses the "//" after a scheme — presumably
    # the values handled here are scheme-less/relative links; confirm.
    result = str(parsed).replace('//', '/')
    if result:
        # strip a leading '.' and a trailing '/'
        if result[0] == '.':
            result = result[1:]
        if result[-1] == '/':
            result = result[:-1]
    return result
def test_url_matches_domain():
    """Spot-check urlcanon.url_matches_domain with str/bytes/parsed inputs."""
    # str/bytes combinations of an IP literal all match
    assert urlcanon.url_matches_domain('http://1.2.3.4/', '1.2.3.4')
    assert urlcanon.url_matches_domain(b'scheme://1.2.3.4', '1.2.3.4')
    assert urlcanon.url_matches_domain('ftp://1.2.3.4/a/b/c/d', b'1.2.3.4')
    assert urlcanon.url_matches_domain(b'http://1.2.3.4', b'1.2.3.4')
    # subdomains match the parent domain, but not the other way around
    assert urlcanon.url_matches_domain(
            'http://foo.example.com', 'example.com')
    assert not urlcanon.url_matches_domain(
            'http://example.com', 'foo.example.com')
    # matching is case-sensitive unless the url is canonicalized first
    assert not urlcanon.url_matches_domain(
            'http://foo.EXAMPLE.COM', 'example.com')
    assert urlcanon.url_matches_domain(
            urlcanon.whatwg('http://foo.EXAMPLE.COM'), 'example.com')
    # unicode hosts only match their punycode form once normalize_host /
    # whatwg canonicalization has been applied to the url side
    assert not urlcanon.url_matches_domain('http://☃.net', 'xn--n3h.net')
    assert urlcanon.url_matches_domain('http://☃.net', '☃.net')
    assert urlcanon.url_matches_domain('http://😬.☃.net', '☃.net')
    assert not urlcanon.url_matches_domain(
            'http://😬.☃.net', urlcanon.normalize_host('☃.net'))
    assert urlcanon.url_matches_domain(
            urlcanon.whatwg('https://😬.☃.net'), urlcanon.normalize_host('☃.net'))
def _build_fresh_page(self, site, parent_page, url, hops_off=0):
    """Construct a brozzler.Page for an outlink of `parent_page`."""
    url_for_scoping = urlcanon.semantic(url)
    url_for_crawling = urlcanon.whatwg(url)
    # Keep the fragment around as a "hashtag", then crawl without it.
    hashtag = (url_for_crawling.hash_sign
               + url_for_crawling.fragment).decode('utf-8')
    urlcanon.canon.remove_fragment(url_for_crawling)
    hashtags = [hashtag] if hashtag else []
    return brozzler.Page(self.rr, {
        'url': str(url_for_crawling),
        'site_id': site.id,
        'job_id': site.job_id,
        'hops_from_seed': parent_page.hops_from_seed + 1,
        'via_page_id': parent_page.id,
        'hops_off_surt': hops_off,
        'hashtags': hashtags})
def _try_youtube_dl(self, ydl, site, page):
    # Attempt a youtube-dl extraction of `page.url`; when the site is being
    # archived through warcprox, also record the youtube-dl json as a WARC
    # metadata record.
    try:
        self.logger.info("trying youtube-dl on {}".format(page))
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            info = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            self._remember_videos(page, ydl.brozzler_spy)
            # logging.info('XXX %s', json.dumps(info))
            if self._using_warcprox(site):
                info_json = json.dumps(info, sort_keys=True, indent=4)
                self.logger.info(
                        "sending WARCPROX_WRITE_RECORD request to warcprox "
                        "with youtube-dl json for %s", page)
                self._warcprox_write_record(
                        warcprox_address=self._proxy_for(site),
                        url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                        warc_type="metadata",
                        content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                        payload=info_json.encode("utf-8"),
                        extra_headers=site.extra_headers())
    except brozzler.ShutdownRequested as e:
        raise
    except BaseException as e:
        # youtube-dl wraps the root cause; classify via its exc_info.
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            # not a supported media page; nothing to do
            pass
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            # NOTE(review): 420 presumably signals a warcprox "reached
            # limit" condition — confirm against warcprox docs.
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and self._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    # Attempt a yt-dlp extraction of `page.url`; when archiving through
    # warcprox, also record the sanitized yt-dlp json as WARC metadata.
    # Returns the sanitized ie_result dict, or None for unsupported urls.
    try:
        logging.info("trying yt-dlp on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            # and yt-dlp needs sanitize_info for extract_info
            ie_result = ydl.sanitize_info(ydl.extract_info(str(urlcanon.whatwg(page.url))))
            _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
            if worker._using_warcprox(site):
                info_json = json.dumps(ie_result, sort_keys=True, indent=4)
                logging.info(
                        "sending WARCPROX_WRITE_RECORD request to warcprox "
                        "with yt-dlp json for %s", page)
                worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site),
                        url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                        warc_type="metadata",
                        content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                        payload=info_json.encode("utf-8"),
                        extra_headers=site.extra_headers(page))
            return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        # yt-dlp wraps the root cause; classify via its exc_info.
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            # not a supported media page
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            # NOTE(review): 420 presumably signals a warcprox "reached
            # limit" condition — confirm against warcprox docs.
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'yt-dlp hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _try_youtube_dl(worker, ydl, site, page):
    # Attempt a youtube-dl extraction of `page.url`; when archiving through
    # warcprox, also record the youtube-dl json as WARC metadata.
    # Returns the ie_result dict, or None for unsupported urls.
    try:
        logging.info("trying youtube-dl on %s", page)
        with brozzler.thread_accept_exceptions():
            # we do whatwg canonicalization here to avoid "<urlopen error
            # no host given>" resulting in ProxyError
            # needs automated test
            ie_result = ydl.extract_info(str(urlcanon.whatwg(page.url)))
            _remember_videos(page, ydl.fetch_spy.fetches, ydl.stitch_ups)
            if worker._using_warcprox(site):
                info_json = json.dumps(ie_result, sort_keys=True, indent=4)
                logging.info(
                        "sending WARCPROX_WRITE_RECORD request to warcprox "
                        "with youtube-dl json for %s", page)
                worker._warcprox_write_record(
                        warcprox_address=worker._proxy_for(site),
                        url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                        warc_type="metadata",
                        content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                        payload=info_json.encode("utf-8"),
                        extra_headers=site.extra_headers())
            return ie_result
    except brozzler.ShutdownRequested as e:
        raise
    except Exception as e:
        # youtube-dl wraps the root cause; classify via its exc_info.
        if hasattr(e, "exc_info") and e.exc_info[0] == youtube_dl.utils.UnsupportedError:
            # not a supported media page
            return None
        elif (hasattr(e, "exc_info")
                and e.exc_info[0] == urllib.error.HTTPError
                and hasattr(e.exc_info[1], "code")
                and e.exc_info[1].code == 420):
            # NOTE(review): 420 presumably signals a warcprox "reached
            # limit" condition — confirm against warcprox docs.
            raise brozzler.ReachedLimit(e.exc_info[1])
        elif (hasattr(e, 'exc_info')
                and e.exc_info[0] == urllib.error.URLError
                and worker._proxy_for(site)):
            # connection problem when using a proxy == proxy error (XXX?)
            raise brozzler.ProxyError(
                    'youtube-dl hit apparent proxy error from '
                    '%s' % page.url) from e
        else:
            raise
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
    '''
    Returns tuple (
        set of in scope urls (uncanonicalized) accepted by robots policy,
        set of in scope urls (canonicalized) blocked by robots policy,
        set of out-of-scope urls (canonicalized)).
    '''
    in_scope, blocked, out_of_scope = set(), set(), set()
    for url in (outlinks or []):
        scoping_form = urlcanon.semantic(url)
        crawling_form = urlcanon.whatwg(url)
        urlcanon.canon.remove_fragment(crawling_form)
        if not site.is_in_scope(scoping_form, parent_page=parent_page):
            out_of_scope.add(str(crawling_form))
        elif brozzler.is_permitted_by_robots(site, str(crawling_form)):
            # note: the original, uncanonicalized url goes in `in_scope`
            in_scope.add(url)
        else:
            blocked.add(str(crawling_form))
    return in_scope, blocked, out_of_scope
def run(self):
    """Crawler worker loop.

    Pulls urls off the shared `frontier` queue, respects per-domain access
    timing and robots.txt, renders each page with selenium-wire, stores the
    page (or its duplicate/binary classification) in the database, and
    pushes newly discovered .gov.si links back onto the frontier.
    """
    while not frontier.empty():
        # get next url from frontier
        url = frontier.get()
        # parse url to get base url and domain name
        split_url = urlsplit(url)
        base = "{0.netloc}".format(split_url)
        domain = base.replace("www.", "") if "www." in base else base
        base_url = "{0.scheme}://{0.netloc}/".format(split_url)
        # first check if can access page
        canAccess = self.checkIPAccessTime(domain)
        if canAccess != None:
            if not canAccess:
                # return url to frontier and move on to the next url
                frontier.put(url)
                continue
        else:
            # NOTE(review): None apparently means the domain could not be
            # checked at all, so the url is dropped entirely — confirm.
            continue
        # check if site already saved
        robotLock.acquire()
        site = self.findSiteByDomain(domain)
        if site:
            robotLock.release()
            siteID = site[0]
            robot_content = site[2]
        else:
            # retrieve robots.txt content
            try:
                r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                robot_content = None
                # if it exists, save it
                if r.status_code == requests.codes.ok:
                    robot_content = r.text
            except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                robot_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)
            # get sitemap.xml
            try:
                s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                sitemap_content = None
                # if it exists save it
                if s.status_code == requests.codes.ok:
                    sitemap_content = s.text
            except(requests.exceptions.MissingSchema, requests.exceptions.ConnectionError, requests.exceptions.InvalidURL, requests.exceptions.InvalidSchema):
                sitemap_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)
            # save site
            siteID = self.insertSite(domain, robot_content, sitemap_content)
            robotLock.release()
        # create robot file parser object
        robot = robotexclusionrulesparser.RobotExclusionRulesParser()
        if robot_content:
            robot.parse(robot_content)
        # check if current url is allowed by robots.txt
        duplicatesLock.acquire()
        if not robot.is_allowed(USER_AGENT, url):
            pageID = self.findPageByUrl(url)
            self.deleteLinkByID(pageID)
            self.deletePageByUrl(url)
            duplicatesLock.release()
            continue
        duplicatesLock.release()
        # download content from url
        try:
            self.webDriver.get(url)
            time.sleep(TIMEOUT)
        except TimeoutException:
            # save timeout
            # NOTE(review): `pageID` and `req` are assigned further down,
            # so on a timeout for the first url this handler can raise
            # NameError or reuse values from a previous iteration — confirm.
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, PAGE_TIMEOUT, None, req.response.status_code, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None, req.response.status_code, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue
        # retrieve request that loaded page
        req = None
        for request in self.webDriver.requests:
            # skip redirect responses
            if request.response and request.response.status_code >= 300 and request.response.status_code <= 399:
                continue
            if request.response and request.path == url:
                req = request
                break
            if request.response and request.response.status_code == requests.codes.ok:
                req = request
                break
        if req == None:
            # fall back to a 403/503 response if no better match was found
            for request in self.webDriver.requests:
                if request.response:
                    if request.response.status_code == 403 or request.response.status_code == 503:
                        req = request
                        break
        if not req:
            req = self.webDriver.last_request
        # check page type and save page info
        pageID = self.findPageByUrl(url)
        if req and req.response:
            content_type = req.response.headers.get('Content-Type')
            if content_type:
                if "text/html" in content_type:
                    # HTML page
                    # check for canonical link
                    try:
                        canonicalLink = self.webDriver.find_element_by_xpath("//link[@rel='canonical']")
                        if canonicalLink:
                            link = canonicalLink.get_attribute('href')
                            if link != url:
                                # is duplicate
                                duplicatesLock.acquire()
                                # check if original page already saved
                                originalPageID = self.findPageByUrl(link)
                                if originalPageID:
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                                else:
                                    # create blank page
                                    originalPageID = self.insertPage(None, FRONTIER, link, None, None, None)
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # add url to frontier
                                    frontier.put(link)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                    except(NoSuchElementException, StaleElementReferenceException):
                        pass
                    # check for duplicate content
                    originalPageID = self.findPageByContent(self.webDriver.page_source)
                    if originalPageID:
                        # is duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                        # add link to original page
                        self.insertLink(pageID, originalPageID)
                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                    # not duplicate
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source, req.response.status_code, datetime.now())
                    else:
                        # save new page and remember id
                        pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source, req.response.status_code, datetime.now())
                    # let through only pages that loaded successfully
                    if req.response.status_code != requests.codes.ok:
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue
                elif "text/plain" in content_type:
                    # TXT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, TXT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/pdf" in content_type:
                    # PDF content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PDF)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/msword" in content_type:
                    # DOC content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOC)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                    # DOCX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOCX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.ms-powerpoint" in content_type:
                    # PPT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                    # PPTX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPTX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "image" in content_type:
                    # IMAGE content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # parse file name
                    filename = urlparse(url)
                    # insert image data
                    self.insertImage(pageID, os.path.basename(filename.path), content_type, datetime.now())
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "text/css" in content_type:
                    # CSS content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSS)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "text/csv" in content_type:
                    # CSV content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSV)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                elif "application/zip" in content_type:
                    # ZIP content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, ZIP)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
                else:
                    # unknown BINARY content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None, req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None, req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, UNKNOWN)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # no content header -> mark page as UNDEFINED
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, UNDEFINED, None, req.response.status_code, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, UNDEFINED, url, None, req.response.status_code, datetime.now())
                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue
        else:
            # some kind of error happened
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue
        # only if page is of HTML type
        # extract links
        # href
        elements = self.webDriver.find_elements_by_xpath("//*[@href]")
        for element in elements:
            try:
                link = element.get_attribute('href')
                # check if url allowed by robots.txt and if is from .gov.si
                if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                    # canonicalize url
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    # add url to frontier
                    self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue
        # onclick
        elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
        for element in elements:
            try:
                line = element.get_attribute('onclick')
                if line:
                    link = ""
                    # pull the target url out of common javascript
                    # navigation idioms
                    if "location.href='" in line:
                        rightLine = line.split("location.href='")[1]
                        link = rightLine.split("'")[0]
                    elif "document.location='" in line:
                        rightLine = line.split("document.location='")[1]
                        link = rightLine.split("'")[0]
                    if link != "":
                        # check if url allowed by robots.txt and if is from .gov.si
                        if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                            # canonicalize url
                            link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                            # add url to frontier
                            self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue
        # extract images
        elements = self.webDriver.find_elements_by_tag_name('img')
        for element in elements:
            try:
                link = element.get_attribute('src')
                # check if url allowed by robots.txt, if is from .gov.si and if src attribute has URL
                if self.isGov(link) and robot.is_allowed(USER_AGENT, link) and re.match(self.urlValidator, link):
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    self.addUrlToFrontier(pageID, link)
            except(NoSuchElementException, StaleElementReferenceException):
                continue
        del self.webDriver.requests
        print(f"Worker {self.threadID}: {url} done...")
    self.conn.close()
    self.webDriver.quit()
    print(f"Worker {self.threadID}: finished crawling.")
def clean_url(s: str) -> str:
    """Trim whitespace, drop a dangling empty-port colon, and WHATWG-canonicalize."""
    parsed = urlcanon.parse_url(s.strip())
    # "host:" with nothing after the colon — remove the stray colon so it
    # does not survive canonicalization.
    if parsed.colon_before_port and not parsed.port:
        parsed.colon_before_port = b""
    return str(urlcanon.whatwg(parsed))
def test_supplemental_whatwg(uncanonicalized, canonicalized):
    # Parse the raw url, canonicalize it in place per WHATWG, and check the
    # serialized bytes against the expected canonical form.
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.whatwg(url)
    assert url.__bytes__() == canonicalized
def parse_record(path, node_id, edge_id, process_record,
                 max_identifier_length, dt14):
    """Convert the WAT records in *path* into Gephi graph-streaming events.

    For each metadata record a "VersionNode" (target URI + timestamp) is
    emitted, followed by a "Node" and an "ae" edge for every outlink.
    The JSON lines assembled for one record are handed to
    *process_record*; when it returns a falsy value a new batch was
    started and node/edge ids restart at 1.

    :param path: WAT file to read (opened binary)
    :param node_id: first node identifier to assign
    :param edge_id: first edge identifier to assign
    :param process_record: callback(record_array, node_id, edge_id) -> bool
    :param max_identifier_length: skip URLs longer than this
    :param dt14: when truthy, rewrite WARC-Date as YYYYmmddHHMMSS
    """
    with open(path, "rb") as infile:
        # loop on every record in the WAT
        for record in ArchiveIterator(infile):
            record_array = []
            if record.rec_type != 'metadata':
                continue
            warc_target_uri = urlcanon.parse_url(
                record.rec_headers.get_header('WARC-Target-URI'))
            # whatwg canonicalization mutates the parsed url in place
            urlcanon.whatwg(warc_target_uri)
            # select only members whose WARC-Target-URI begins with
            # "https?://" and is not unreasonably long
            if not re.search("^https?://", str(warc_target_uri)) or len(
                    str(warc_target_uri)) > max_identifier_length:
                continue
            dt = record.rec_headers.get_header('WARC-Date')
            if dt14:
                dt = dp.parse(dt).strftime('%Y%m%d%H%M%S')
            # construct node with timestamp (VersionNode)
            version_node = {
                "an": {
                    node_id: {
                        "identifier": str(
                            warc_target_uri.ssurt(), encoding='utf-8'),
                        "timestamp": dt,
                        "TYPE": "VersionNode"
                    }
                }
            }
            record_array.append(json.dumps(version_node))
            record_array.append('\r\n')
            source_id = node_id
            node_id += 1
            content = json.loads(record.raw_stream.read().decode('utf-8'))
            try:
                links = content["Envelope"]["Payload-Metadata"][
                    "HTTP-Response-Metadata"]["HTML-Metadata"]["Links"]
            except (KeyError, TypeError):
                # BUGFIX: was a bare `except:`; only a missing or
                # odd-shaped payload is an expected reason to lack links
                links = ''
            # loop on links if not empty and get all urls
            if links != '':
                for link in links:
                    # empty outlink elements occur in the wild (possibly a
                    # bug in the webarchive-commons WAT generator), hence
                    # the per-link try/except
                    try:
                        # convert relative outlink to an absolute url
                        outlink = urlcanon.parse_url(
                            urljoin(str(warc_target_uri), link["url"]))
                        # BUGFIX: whatwg() used to be called on the plain
                        # string with its result discarded, so outlinks
                        # were never actually canonicalized
                        urlcanon.whatwg(outlink)
                        url = str(outlink)
                        # match only urls that begin with "https?://"
                        if not re.search("^https?://", url) or len(
                                url) > max_identifier_length:
                            continue
                        # construct node and edge
                        node = {
                            "an": {
                                node_id: {
                                    "identifier": str(
                                        outlink.ssurt(), encoding="utf-8"),
                                    "TYPE": "Node"
                                }
                            }
                        }
                        edge = {
                            "ae": {
                                edge_id: {
                                    "directed": "true",
                                    "source": str(source_id),
                                    "target": str(node_id)
                                }
                            }
                        }
                        record_array.append(json.dumps(node))
                        record_array.append('\r\n')
                        record_array.append(json.dumps(edge))
                        record_array.append('\r\n')
                        node_id += 1
                        edge_id += 1
                    except (KeyError, TypeError, ValueError):
                        # malformed outlink element: skip it
                        continue
            same_batch = process_record(record_array, node_id, edge_id)
            if not same_batch:
                # a new batch was started: restart the id counters
                node_id = edge_id = 1
def canon(s: str) -> str:
    """Return the WHATWG-canonicalized form of *s*."""
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
def gather_links(self):
    """Collect outlinks and images from the page currently being crawled.

    Parses ``self.current_page_html`` and gathers <a href> links, onclick
    ``location=`` redirects, and <img src> urls, all resolved relative to
    the site base url (``self.site_currently_crawling[1]``).  Each image
    is fetched immediately and stored via ``db.insert_image``.

    :returns: list of http(s) links as ``urllib.parse.ParseResult``
    """
    soup = BeautifulSoup(self.current_page_html, "lxml")
    base_url = self.site_currently_crawling[1]
    links = set()
    images = set()

    # <a href="..."> links
    for link in soup.find_all("a"):
        current_url = urllib.parse.urljoin(base_url, link.get('href'))
        # canonicalize (urlcanon mutates the parsed url in place)
        current_parsed_url_urlcanon = urlcanon.parse_url(current_url)
        urlcanon.whatwg(current_parsed_url_urlcanon)
        # NOTE(review): the canonicalized form is computed but never used
        # below; the returned links keep the raw urllib parse -- confirm
        current_parsed_url = urllib.parse.urlparse(current_url)
        if (current_parsed_url.scheme != "http"
                and current_parsed_url.scheme != "https"):
            continue
        links.add(current_parsed_url)

    # onclick handlers of the form ...location=<quote><url><quote>...
    for onclick_tag in soup.find_all(attrs={'onclick': True}):
        try:
            # BUGFIX: operate on the attribute *string*.  The original
            # called Tag.find()/Tag.split() on the element itself, which
            # always raised (Tag.find searches child tags and returned
            # None) and silently skipped every onclick link.
            handler = onclick_tag['onclick']
            x = handler.find("location=")
            if x < 0:
                continue
            # the character right after "location=" is the quote delimiter
            parts = handler.split(handler[x + 9])
            for index, segment in enumerate(parts):
                if "location=" in segment:
                    loc = parts[index + 1]
                    current_url = urllib.parse.urljoin(base_url, loc)
                    current_parsed_url_urlcanon = urlcanon.parse_url(
                        current_url)
                    urlcanon.whatwg(current_parsed_url_urlcanon)
                    links.add(urllib.parse.urlparse(current_url))
                    break
        except Exception:
            # malformed handler: best-effort extraction, skip it
            continue

    # <img src="..."> urls
    for image in soup.find_all("img"):
        image_url = urllib.parse.urljoin(base_url, image.get('src'))
        images.add(urllib.parse.urlparse(image_url))

    # download each image and store it in the database
    for image in images:
        fullurl = urllib.parse.urlparse(
            urllib.parse.urljoin(base_url, image.geturl()))
        try:
            res = requests.get(fullurl.geturl())
        except Exception:
            # network failure fetching one image should not abort the page
            continue
        # BUGFIX: a response without a content-type header used to raise
        content_type = res.headers.get('content-type')
        path = urllib.parse.urlparse(image.geturl()).path
        filename = os.path.basename(path)
        db.insert_image(self.page_currently_crawling[0], filename,
                        content_type, res.content, int(time.time()))

    return list(links)
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
    """Scope each outlink of *parent_page* and schedule those in scope.

    Each url in *outlinks* is tested against the site's scope rules and
    robots.txt.  Accepted links become new `brozzler.Page` entries, or
    bump the priority (and merge the hashtags) of an existing page with
    the same id.  The per-link decisions are recorded on
    ``parent_page.outlinks`` and a summary is logged.

    :param site: the site being crawled
    :param parent_page: page the outlinks were discovered on
    :param outlinks: iterable of outlink url strings (may be None)
    """
    decisions = {"accepted": set(), "blocked": set(), "rejected": set()}
    counts = {"added": 0, "updated": 0, "rejected": 0, "blocked": 0}
    for url in outlinks or []:
        # semantic form for scope checks, whatwg form for crawling
        url_for_scoping = urlcanon.semantic(url)
        url_for_crawling = urlcanon.whatwg(url)
        # remember the fragment, then strip it from the crawl url
        hashtag = (url_for_crawling.hash_sign
                   + url_for_crawling.fragment).decode('utf-8')
        urlcanon.canon.remove_fragment(url_for_crawling)
        if site.is_in_scope(url_for_scoping, parent_page=parent_page):
            if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
                # a link outside the seed's surt prefix counts as one more
                # "hop off surt" than its parent
                if not url_for_scoping.surt().startswith(
                        site.scope["surt"].encode("utf-8")):
                    hops_off_surt = parent_page.hops_off_surt + 1
                else:
                    hops_off_surt = 0
                new_child_page = brozzler.Page(
                    self.rr, {
                        'url': str(url_for_crawling),
                        'site_id': site.id,
                        'job_id': site.job_id,
                        'hops_from_seed': parent_page.hops_from_seed + 1,
                        'via_page_id': parent_page.id,
                        'hops_off_surt': hops_off_surt
                    })
                existing_child_page = brozzler.Page.load(
                    self.rr, new_child_page.id)
                if existing_child_page:
                    # already scheduled: raise priority and merge hashtags
                    existing_child_page.priority += new_child_page.priority
                    if hashtag and existing_child_page.hashtags:
                        hashtags = set(existing_child_page.hashtags)
                        hashtags.add(hashtag)
                        existing_child_page.hashtags = list(hashtags)
                    elif hashtag:
                        existing_child_page.hashtags = [hashtag]
                    existing_child_page.save()
                    counts["updated"] += 1
                else:
                    if hashtag:
                        new_child_page.hashtags = [
                            hashtag,
                        ]
                    new_child_page.save()
                    counts["added"] += 1
                decisions["accepted"].add(str(url_for_crawling))
            else:
                counts["blocked"] += 1
                decisions["blocked"].add(str(url_for_crawling))
        else:
            counts["rejected"] += 1
            decisions["rejected"].add(str(url_for_crawling))
    # persist the per-link decisions on the parent page
    parent_page.outlinks = {}
    for k in decisions:
        parent_page.outlinks[k] = list(decisions[k])
    parent_page.save()
    self.logger.info(
        "%s new links added, %s existing links updated, %s links "
        "rejected, %s links blocked by robots from %s",
        counts["added"], counts["updated"], counts["rejected"],
        counts["blocked"], parent_page)
def main(self):
    """Fetch ``self.page_url`` with headless Firefox and store the result.

    Renders the page, detects exact-content duplicates via an MD5 hash
    of the HTML, extracts <a>/onclick links into the frontier, records
    <img> urls, and finally saves the page row itself.  Always closes
    the DB session and quits the webdriver before returning.
    """
    # The page contains HTML, lets scrape it --------------------------------------------------
    firefox_options = FirefoxOptions()
    # Adding a specific user agent
    # NOTE(review): "user-agent=..." is a Chrome-style switch; Firefox
    # likely ignores it -- confirm (a profile preference would be needed)
    firefox_options.add_argument("user-agent=fri-ieps-kslk")
    firefox_options.add_argument("--headless")
    print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")
    self.driver = webdriver.Firefox(
        options=firefox_options,
        executable_path=Config.WEB_DRIVER_LOCATION_GECKO)
    self.driver.set_page_load_timeout(10)
    self.driver.get(self.page_url)
    # Timeout needed for Web page to render (read more about it)
    time.sleep(Config.RENDERING_TIMEOUT)
    self.html_content = self.driver.page_source

    # Checking for duplicates ------------------------------------------------------------------
    self.hashed_content = hashlib.md5(
        self.html_content.encode("utf-8")).hexdigest()
    is_duplicate = self.session.query(Page).filter(
        Page.content_hash == self.hashed_content).first()
    if is_duplicate:
        # identical content already stored: mark as DUPLICATE and stop
        self.page_db.page_type_code = "DUPLICATE"
        self.page_db.http_status_code = self.status_code
        self.page_db.site_id = self.site_id
        self.page_db.url = self.page_url
        self.page_db.accessed_time = getTimestamp()
        self.page_db.content_hash = self.hashed_content
        self.session.commit()
        self.session.close()
        self.driver.quit()
        return

    # The page is valid html and its not a duplicate, now we extract all the links on the page ---
    links = []
    # First, we extract the links with tag name "a"
    elems = self.driver.find_elements_by_tag_name("a")
    for elem in elems:
        href = elem.get_attribute('href')
        if href is None:
            continue
        if href.startswith("/"):
            # relative link: prefix with the site's base url
            links.append(self.base_url + href)
        elif href is not None and ("http" in href or "https" in href):
            links.append(href)

    # We also extract links from the onclick sections
    onclicks = self.driver.find_elements_by_xpath("//*[@onclick]")
    for el in onclicks:
        temp = el.get_attribute("onclick")
        if "location.href=" in temp:
            # keep only the (unquoted) redirect target url
            temp = temp.replace("location.href=", "")\
                .replace("\'", "")\
                .replace("\"", "")
            links.append(temp)

    # Remove the links that point outside of .gov.si
    links_trancuted = []
    for el in links:
        if "gov.si/" in el:
            links_trancuted.append(el)
    links = links_trancuted

    # Put the links in the canonical form (urlcanon mutates in place)
    links_canonical = []
    for el in links:
        parsed_link = urlcanon.parse_url(el)
        urlcanon.whatwg(parsed_link)
        links_canonical.append(str(parsed_link))
    links = links_canonical

    # Save the links to the DB -----------------------------------------------------------------
    for link in links:
        # Check if link is already in the DB
        is_duplicate = self.session.query(Page).filter(
            Page.url == link).first()
        if is_duplicate is None:
            extracted_domain_name = get_domain_name_from_url(link)
            page = Page()
            page.site_id = self.get_site_id_for_page(extracted_domain_name)
            # Pages with status == None have yet to be visited
            page.status = None
            page.page_type_code = "FRONTIER"
            page.url = link
            self.session.add(page)
            self.session.commit()
            # Also add a Link (edge) from this page to the new one
            link_ = Link()
            link_.from_page = self.page_id
            link_.to_page = self.session.query(Page).filter(
                Page.url == link).first().id
            self.session.add(link_)
            self.session.commit()
        # else: the page is already in the DB, nothing to add

    # Finding and storing the images on the page --------------------------------------------------
    imgs = self.driver.find_elements_by_tag_name("img")
    for elem in imgs:
        src = elem.get_attribute("src")
        url = ""
        if src is None:
            continue
        if src.startswith("/"):
            url = self.base_url + src
        elif src is not None and ("http" in src or "https" in src):
            url = src
        if url != "" and len(url) <= 255:
            # Save the image reference (content itself is not downloaded)
            image = Image()
            image.page_id = self.page_id
            image.filename = url
            image.content_type = "BINARY"
            image.accessed_time = getTimestamp()
            self.session.add(image)
            self.session.commit()

    # With all the data scraped, we can save the page to the DB -------------------------------------
    self.page_db.html_content = self.html_content
    self.page_db.accessed_time = getTimestamp()
    self.page_db.content_hash = self.hashed_content
    self.page_db.http_status_code = self.status_code
    self.page_db.site_id = self.site_id
    self.page_db.page_type_code = "HTML"
    self.page_db.url = self.page_url
    self.session.commit()

    # Lets be responsible and close the session and the driver
    self.session.close()
    self.driver.quit()