def proxy_address(self, flow: http.HTTPFlow) -> typing.Tuple[str, int]:
    # Check if the URL is known to the CDX server
    playback = False
    # Use the canonicalised URL
    r_url = str(urlcanon.whatwg(urlcanon.parse_url(flow.request.url)))
    # Query the CDX service for this URL:
    ctx.log.info("checking %s..." % r_url)
    r = self.s.get('http://cdxserver:8080/fc', params={
        'url': r_url,
        'sort': 'reverse',
        'limit': 10
    })
    # Loop through response CDX lines:
    for cdxline in r.iter_lines(decode_unicode=True):
        cdx = cdxline.split(" ")
        # Compare canonicalised URLs (in case an intermediary e.g. adds a default :80 port)
        cdx_url = str(urlcanon.whatwg(urlcanon.parse_url(cdx[2])))
        if r_url == cdx_url:
            ctx.log.info("MATCH")
            playback = True
            break
        else:
            ctx.log.info("NO MATCH '%s' '%s'" % (r_url, cdx_url))
    # Either playback or record, depending on the outcome:
    if playback:
        ctx.log.info("PYWB")
        return ("pywb", 8080)
    else:
        ctx.log.info("WARCPROX")
        return ("warcprox", 8000)
def test_parser_idempotence():
    path = os.path.join(
        os.path.dirname(__file__), '..', '..', 'testdata', 'idempotence.json')
    with open(path, 'rb') as f:
        inputs = load_json_bytes(f.read())
    for s in inputs:
        assert urlcanon.parse_url(s).__bytes__() == s
def new_site(frontier, site):
    site.id = str(uuid.uuid4())
    logging.info("new site {}".format(site))
    # insert the Page into the database before the Site, to avoid situation
    # where a brozzler worker immediately claims the site, finds no pages
    # to crawl, and decides the site is finished
    try:
        url = urlcanon.parse_url(site.seed)
        hashtag = (url.hash_sign + url.fragment).decode("utf-8")
        urlcanon.canon.remove_fragment(url)
        page = brozzler.Page(frontier.rr, {
            "url": str(url),
            "site_id": site.get("id"),
            "job_id": site.get("job_id"),
            "hops_from_seed": 0,
            "priority": 1000,
            "needs_robots_check": True})
        if hashtag:
            page.hashtags = [hashtag, ]
        page.save()
        logging.info("queued page %s", page)
    finally:
        # finally block because we want to insert the Site no matter what
        site.save()
def from_url(cls, url):
    """Returns broken-down SURT from a URL.

    Arguments:
    url -- The URL to SURTify.

    Returns:
    A SURT broken down into its parts.
    """
    return cls(parse_url(url).surt().decode('utf-8'))
def new_seed_page(frontier, site):
    url = urlcanon.parse_url(site.seed)
    hashtag = (url.hash_sign + url.fragment).decode("utf-8")
    urlcanon.canon.remove_fragment(url)
    page = brozzler.Page(frontier.rr, {
        "url": str(url),
        "site_id": site.get("id"),
        "job_id": site.get("job_id"),
        "hops_from_seed": 0,
        "priority": 1000,
        "needs_robots_check": True})
    if hashtag:
        page.hashtags = [hashtag, ]
    return page
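Both brozzler helpers above split the fragment off the seed before queueing it: urlcanon exposes the fragment pieces as bytes attributes (`hash_sign`, `fragment`), and `urlcanon.canon.remove_fragment` strips them from the parsed URL in place. A minimal standalone sketch of that step, using an assumed example URL:

import urlcanon

# hypothetical seed, purely for illustration
url = urlcanon.parse_url("https://example.com/page#section-2")

# fragment pieces are raw bytes on the parsed object
hashtag = (url.hash_sign + url.fragment).decode("utf-8")   # "#section-2"
urlcanon.canon.remove_fragment(url)                        # mutates url in place

print(str(url))    # https://example.com/page
print(hashtag)     # #section-2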
def url_matches_domain_exactly(url, domain):
    '''
    Returns true if
     - domain is an ip address and url.host is the same ip address
     - domain is a domain and url.host is the same domain

    Does not do any normalization/canonicalization. Probably a good idea to
    call `host_matches_domain(canonicalize(url), urlcanon.normalize_host(domain))`.
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.parse_url(url)
    return host_matches_domain_exactly(url.host, domain)
def applies(self, url, parent_url=None):
    '''
    Returns true if `url` matches `match_rule`.

    All conditions must match for a url to be considered a match. The caller
    should normally canonicalize `url` and `parent_url` before passing them
    to this method.

    Args:
        url (urlcanon.ParsedUrl or bytes or str): already canonicalized url
        parent_url (urlcanon.ParsedUrl or bytes or str, optional): parent
            url, should be supplied if the rule has a `parent_url_regex`

    Returns:
        bool: True if the rule matches, False otherwise
    '''
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.parse_url(url)
    if self.domain:
        domain_test_fn = (url_matches_domain if not self.exact
                          else url_matches_domain_exactly)
        if not domain_test_fn(url, self.domain):
            return False
    if self.surt:
        surt = url.surt()
        if not (surt == self.surt if self.exact
                else surt.startswith(self.surt)):
            return False
    if self.ssurt:
        surt = url.ssurt()
        if not (surt == self.ssurt if self.exact
                else surt.startswith(self.ssurt)):
            return False
    if self.substring and not url.__bytes__().find(self.substring) >= 0:
        return False
    if self.regex:
        if not self.regex.match(url.__bytes__()):
            return False
    if self.parent_url_regex:
        if not parent_url:
            return False
        if isinstance(parent_url, urlcanon.ParsedUrl):
            parent_url = parent_url.__bytes__()
        elif isinstance(parent_url, unicode):
            parent_url = parent_url.encode('utf-8')
        if not self.parent_url_regex.match(parent_url):
            return False
    return True
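This `applies` method appears to belong to urlcanon's `MatchRule`. A small usage sketch, assuming the constructor accepts the attributes tested above (`surt`, `ssurt`, `domain`, `substring`, `regex`, `parent_url_regex`, `exact`) as keyword arguments; building the rule's SURT and the candidate's SURT with the same canonicalizer keeps the prefix comparison consistent:

import urlcanon

# assumed constructor usage: the rule matches any URL whose SURT starts with
# the SURT of the canonicalized http://example.com/foo
rule = urlcanon.MatchRule(
    surt=urlcanon.whatwg(urlcanon.parse_url("http://example.com/foo")).surt())

# canonicalize the candidate the same way before matching, as the docstring advises
candidate = urlcanon.whatwg(urlcanon.parse_url("HTTP://EXAMPLE.com/foo/bar"))
print(rule.applies(candidate))   # expected: True (SURT prefix match)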
def test_w3c_test_data(input, href, test):
    url = urlcanon.parse_url(input)
    urlcanon.whatwg(url)
    assert test['protocol'].encode('utf-8') == (url.scheme + url.colon_after_scheme)
    assert test['username'].encode('utf-8') == url.username
    assert test['password'].encode('utf-8') == url.password
    assert test['host'].encode('utf-8') == url.host_port
    assert test['hostname'].encode('utf-8') == url.host
    assert test['pathname'].encode('utf-8') == url.path
    assert test['search'].encode('utf-8') == (
        url.query and (url.question_mark + url.query) or b'')
    assert test['hash'].encode('utf-8') == (
        url.fragment and (url.hash_sign + url.fragment) or b'')
    assert test['href'] == unicode(url)
def url_to_canon(self, url):
    parsed_url = urlcanon.parse_url(url)
    urlcanon.whatwg(parsed_url)
    parsed_url = str(parsed_url)
    if parsed_url.lower().endswith("index.html"):
        parsed_url = parsed_url[:parsed_url.index("index.html")]
    neki2 = parsed_url.rsplit('/', 1)[1]
    if '#' in neki2:
        parsed_url = parsed_url[:parsed_url.index("#")]
    if neki2 != '' and '.' not in neki2 and not neki2.endswith('/'):
        parsed_url += '/'
    parsed_url = urllib.parse.unquote(parsed_url)
    if parsed_url.count(':') == 1:
        ena, dva = parsed_url.split(':')
        if ' ' in dva:
            parsed_url = ena + ':' + urllib.parse.quote(dva)
    parsed_url = url_normalize.url_normalize(parsed_url)
    return parsed_url
def normalizeUrl(url, link_value):
    parsed_url_str = None
    for exc in PATH_EXCLUSIONS:
        if exc in url.path:
            return None
    if (url.query or url.fragment or url.scheme == "mailto"
            or url.scheme == "tel" or url.scheme == "data"
            or url.scheme == "javascript"):
        return None
    link_value = eliminateFromURL(link_value, EXTRAS)
    parsed_url = urlcanon.parse_url(link_value)
    urlcanon.whatwg(parsed_url)
    parsed_url_str = str(parsed_url)
    parsed_url_str = parsed_url_str.replace('//', '/')
    if parsed_url_str:
        if parsed_url_str[0] == '.':
            parsed_url_str = parsed_url_str[1:]
        if parsed_url_str[-1] == '/':
            parsed_url_str = parsed_url_str[:-1]
    return parsed_url_str
def add_articles():
    Article.objects.delete(
        date__gte=(datetime.datetime.now() - datetime.timedelta(days=2)))
    idk = FeedModel.objects.all()
    for bar in idk:
        print(bar.url)
        foo = feedparser.parse(bar.url)
        for post in foo.entries:
            time.sleep(10)
            parsed_url = urlcanon.parse_url(post.link)
            og = OpenGraph(url=post.link)
            try:
                category = model.predict([post.title])
                Article.objects.add_article(
                    post.title, post.description, parsed_url, og.image,
                    bar.title, category)
                logger.info("Article Added")
            except:
                logger.info("Did Not Work")
                continue
def prune_outlinks(self, dirty_links, block_list=None):
    '''
    Filter for valid schemes, remove URL fragments, and drop any other
    designated URLs from the list.
    '''
    links = set()
    dirty_links = set(dirty_links)
    self.logger.info('Pruning links...')
    for link in dirty_links:
        link = urlcanon.parse_url(link)
        if link.scheme in (b'http', b'https', b'ftp'):
            urlcanon.canon.remove_fragment(link)
            link = str(link).strip()
            links.add(link)
    self.logger.info('Pruning complete.')
    # Need to remove after link fragments have been removed to prevent duplication.
    if block_list:
        self.logger.info('Removing Links: %s', ', '.join(block_list))
        links = links.difference(block_list)
    return links
def test_parsing(input, parsed_fields):
    parsed_url = urlcanon.parse_url(input)
    assert parsed_url.leading_junk == parsed_fields[b'leading_junk']
    assert parsed_url.scheme == parsed_fields[b'scheme']
    assert parsed_url.colon_after_scheme == parsed_fields[b'colon_after_scheme']
    assert parsed_url.slashes == parsed_fields[b'slashes']
    assert parsed_url.username == parsed_fields[b'username']
    assert parsed_url.colon_before_password == parsed_fields[b'colon_before_password']
    assert parsed_url.password == parsed_fields[b'password']
    assert parsed_url.at_sign == parsed_fields[b'at_sign']
    assert parsed_url.ip6 == parsed_fields[b'ip6']
    assert parsed_url.ip4 == parsed_fields[b'ip4']
    assert parsed_url.host == parsed_fields[b'host']
    assert parsed_url.colon_before_port == parsed_fields[b'colon_before_port']
    assert parsed_url.port == parsed_fields[b'port']
    assert parsed_url.path == parsed_fields[b'path']
    assert parsed_url.question_mark == parsed_fields[b'question_mark']
    assert parsed_url.query == parsed_fields[b'query']
    assert parsed_url.hash_sign == parsed_fields[b'hash_sign']
    assert parsed_url.fragment == parsed_fields[b'fragment']
    assert parsed_url.trailing_junk == parsed_fields[b'trailing_junk']
def parse_record(path, node_id, edge_id, process_record,
                 max_identifier_length, dt14):
    with open(path, "rb") as infile:
        # loop on every record in the WAT file
        for record in ArchiveIterator(infile):
            record_array = []
            if record.rec_type != 'metadata':
                continue
            warc_target_uri = urlcanon.parse_url(
                record.rec_headers.get_header('WARC-Target-URI'))
            urlcanon.whatwg(warc_target_uri)  # canonicalization
            # select only members whose WARC-Target-URI begins with "https?://"
            if not re.search("^https?://", str(warc_target_uri)) or len(
                    str(warc_target_uri)) > max_identifier_length:
                continue
            dt = record.rec_headers.get_header('WARC-Date')
            if dt14:
                dt = dp.parse(dt).strftime('%Y%m%d%H%M%S')
            # construct node with timestamp (VersionNode)
            version_node = {
                "an": {
                    node_id: {
                        "identifier": str(warc_target_uri.ssurt(), encoding='utf-8'),
                        "timestamp": dt,
                        "TYPE": "VersionNode"
                    }
                }
            }
            record_array.append(json.dumps(version_node))
            record_array.append('\r\n')
            source_id = node_id
            node_id += 1
            content = json.loads(record.raw_stream.read().decode('utf-8'))
            try:
                links = content["Envelope"]["Payload-Metadata"][
                    "HTTP-Response-Metadata"]["HTML-Metadata"]["Links"]
            except:
                links = ''
            # loop on links if not empty and get all urls
            if links != '':
                for link in links:
                    # this is for empty outlink elements, maybe a bug in
                    # webarchive-commons used to generate WAT
                    try:
                        # convert relative outlink to an absolute one
                        url = urljoin(str(warc_target_uri), link["url"])
                        # canonicalization
                        url = urlcanon.whatwg(urlcanon.parse_url(url))
                        # match only urls that begin with "https?://"
                        if not re.search("^https?://", str(url)) or len(
                                str(url)) > max_identifier_length:
                            continue
                        # construct node and edge
                        node = {
                            "an": {
                                node_id: {
                                    "identifier": str(url.ssurt(), encoding="utf-8"),
                                    "TYPE": "Node"
                                }
                            }
                        }
                        edge = {
                            "ae": {
                                edge_id: {
                                    "directed": "true",
                                    "source": str(source_id),
                                    "target": str(node_id)
                                }
                            }
                        }
                        record_array.append(json.dumps(node))
                        record_array.append('\r\n')
                        record_array.append(json.dumps(edge))
                        record_array.append('\r\n')
                        node_id += 1
                        edge_id += 1
                    except:
                        continue
            same_batch = process_record(record_array, node_id, edge_id)
            if not same_batch:
                node_id = edge_id = 1
def from_seeds(seed_list: List[str]) -> "Scope":
    new_list: Set[bytes] = set()
    for url in seed_list:
        surt = parse_url(url).surt(with_scheme=False)
        new_list.add(surt[0:surt.index(surt_end) + 1])
    return Scope(new_list)
def in_scope(self, url: str) -> bool:
    usurt = parse_url(url).surt(with_scheme=False)
    for surt in self.surts:
        if usurt.startswith(surt):
            return True
    return False
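A usage sketch for the two Scope methods above, assuming from_seeds is exposed as a staticmethod (or classmethod) on Scope, that the constructor stores the truncated seed SURTs in self.surts, and that surt_end marks the end of the host portion so scoping is effectively per-site:

# hypothetical seeds, purely for illustration
scope = Scope.from_seeds([
    "https://example.com/",
    "https://example.org/news/",
])

# prefix match against the stored seed SURTs
print(scope.in_scope("https://example.com/about"))   # likely True (same site)
print(scope.in_scope("https://unrelated.net/"))      # likely False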
def test_semantic_precise(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.semantic_precise(url)
    assert url.__bytes__() == canonicalized
def test_aggressive(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    # if uncanonicalized == b' https://www.google.com/ ':
    #     import pdb; pdb.set_trace()
    urlcanon.aggressive(url)
    assert url.__bytes__() == canonicalized
def test_google_canonicalizer(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.google(url)
    assert url.__bytes__() == canonicalized
def test_surt_without_trailing_comma(url, surt):
    assert urlcanon.parse_url(url).surt(trailing_comma=False) == surt
def test_surt_without_scheme(url, surt):
    assert urlcanon.parse_url(url).surt(with_scheme=False) == surt
def test_surt(url, surt):
    assert urlcanon.parse_url(url).surt() == surt
def test_supplemental_whatwg(uncanonicalized, canonicalized):
    url = urlcanon.parse_url(uncanonicalized)
    urlcanon.whatwg(url)
    assert url.__bytes__() == canonicalized
def get_canonized_url(url):
    return urlcanon.parse_url(url)
def run(self):
    while not frontier.empty():
        # get next url from frontier
        url = frontier.get()

        # parse url to get base url and domain name
        split_url = urlsplit(url)
        base = "{0.netloc}".format(split_url)
        domain = base.replace("www.", "") if "www." in base else base
        base_url = "{0.scheme}://{0.netloc}/".format(split_url)

        # first check if can access page
        canAccess = self.checkIPAccessTime(domain)
        if canAccess != None:
            if not canAccess:
                # return url to frontier and move on to the next url
                frontier.put(url)
                continue
        else:
            continue

        # check if site already saved
        robotLock.acquire()
        site = self.findSiteByDomain(domain)
        if site:
            robotLock.release()
            siteID = site[0]
            robot_content = site[2]
        else:
            # retrieve robots.txt content
            try:
                r = requests.get(parse.urljoin(base_url, 'robots.txt'))
                robot_content = None
                # if it exists, save it
                if r.status_code == requests.codes.ok:
                    robot_content = r.text
            except (requests.exceptions.MissingSchema,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.InvalidURL,
                    requests.exceptions.InvalidSchema):
                robot_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)

            # get sitemap.xml
            try:
                s = requests.get(parse.urljoin(base_url, 'sitemap.xml'))
                sitemap_content = None
                # if it exists save it
                if s.status_code == requests.codes.ok:
                    sitemap_content = s.text
            except (requests.exceptions.MissingSchema,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.InvalidURL,
                    requests.exceptions.InvalidSchema):
                sitemap_content = None
            # wait some time
            time.sleep(MINOR_TIMEOUT)

            # save site
            siteID = self.insertSite(domain, robot_content, sitemap_content)
            robotLock.release()

        # create robot file parser object
        robot = robotexclusionrulesparser.RobotExclusionRulesParser()
        if robot_content:
            robot.parse(robot_content)

        # check if current url is allowed by robots.txt
        duplicatesLock.acquire()
        if not robot.is_allowed(USER_AGENT, url):
            pageID = self.findPageByUrl(url)
            self.deleteLinkByID(pageID)
            self.deletePageByUrl(url)
            duplicatesLock.release()
            continue
        duplicatesLock.release()

        # download content from url
        try:
            self.webDriver.get(url)
            time.sleep(TIMEOUT)
        except TimeoutException:
            # save timeout
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, PAGE_TIMEOUT, None,
                                req.response.status_code, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, PAGE_TIMEOUT, url, None,
                                         req.response.status_code, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue

        # retrieve request that loaded page
        req = None
        for request in self.webDriver.requests:
            if (request.response and request.response.status_code >= 300
                    and request.response.status_code <= 399):
                continue
            if request.response and request.path == url:
                req = request
                break
            if request.response and request.response.status_code == requests.codes.ok:
                req = request
                break
        if req == None:
            for request in self.webDriver.requests:
                if request.response:
                    if request.response.status_code == 403 or request.response.status_code == 503:
                        req = request
                        break
        if not req:
            req = self.webDriver.last_request

        # check page type and save page info
        pageID = self.findPageByUrl(url)
        if req and req.response:
            content_type = req.response.headers.get('Content-Type')
            if content_type:
                if "text/html" in content_type:
                    # HTML page
                    # check for canonical link
                    try:
                        canonicalLink = self.webDriver.find_element_by_xpath(
                            "//link[@rel='canonical']")
                        if canonicalLink:
                            link = canonicalLink.get_attribute('href')
                            if link != url:
                                # is duplicate
                                duplicatesLock.acquire()
                                # check if original page already saved
                                originalPageID = self.findPageByUrl(link)
                                if originalPageID:
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE,
                                                        None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None,
                                                                 None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                                else:
                                    # create blank page
                                    originalPageID = self.insertPage(None, FRONTIER, link,
                                                                     None, None, None)
                                    duplicatesLock.release()
                                    if pageID:
                                        # page already saved
                                        self.updatePage(pageID, None, DUPLICATE,
                                                        None, None, datetime.now())
                                    else:
                                        # save new page and remember id
                                        pageID = self.insertPage(None, DUPLICATE, None,
                                                                 None, None, datetime.now())
                                    # add link to original page
                                    self.insertLink(pageID, originalPageID)
                                    # add url to frontier
                                    frontier.put(link)
                                    # continue to next url in frontier
                                    del self.webDriver.requests
                                    print(f"Worker {self.threadID}: {url} done...")
                                    continue
                    except (NoSuchElementException, StaleElementReferenceException):
                        pass

                    # check for duplicate content
                    originalPageID = self.findPageByContent(self.webDriver.page_source)
                    if originalPageID:
                        # is duplicate
                        if pageID:
                            # page already saved
                            self.updatePage(pageID, None, DUPLICATE, None, None, datetime.now())
                        else:
                            # save new page and remember id
                            pageID = self.insertPage(None, DUPLICATE, None, None, None, datetime.now())
                        # add link to original page
                        self.insertLink(pageID, originalPageID)
                        # continue to next url in frontier
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue

                    # not duplicate
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, FRONTIER_HTML, self.webDriver.page_source,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page and remember id
                        pageID = self.insertPage(siteID, FRONTIER_HTML, url, self.webDriver.page_source,
                                                 req.response.status_code, datetime.now())

                    # let through only pages that loaded successfully
                    if req.response.status_code != requests.codes.ok:
                        del self.webDriver.requests
                        print(f"Worker {self.threadID}: {url} done...")
                        continue

                elif "text/plain" in content_type:
                    # TXT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, TXT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/pdf" in content_type:
                    # PDF content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PDF)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/msword" in content_type:
                    # DOC content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOC)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type:
                    # DOCX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, DOCX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/vnd.ms-powerpoint" in content_type:
                    # PPT content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPT)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/vnd.openxmlformats-officedocument.presentationml.presentation" in content_type:
                    # PPTX content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, PPTX)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "image" in content_type:
                    # IMAGE content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # parse file name
                    filename = urlparse(url)
                    # insert image data
                    self.insertImage(pageID, os.path.basename(filename.path),
                                     content_type, datetime.now())
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "text/css" in content_type:
                    # CSS content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSS)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "text/csv" in content_type:
                    # CSV content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, CSV)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                elif "application/zip" in content_type:
                    # ZIP content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, ZIP)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue

                else:
                    # unknown BINARY content
                    if pageID:
                        # page already saved
                        self.updatePage(pageID, siteID, BINARY, None,
                                        req.response.status_code, datetime.now())
                    else:
                        # save new page
                        pageID = self.insertPage(siteID, BINARY, url, None,
                                                 req.response.status_code, datetime.now())
                    # insert page data
                    self.insertPageData(pageID, UNKNOWN)
                    # continue to next url in frontier
                    del self.webDriver.requests
                    print(f"Worker {self.threadID}: {url} done...")
                    continue
            else:
                # no content header -> mark page as UNDEFINED
                if pageID:
                    # page already saved
                    self.updatePage(pageID, siteID, UNDEFINED, None,
                                    req.response.status_code, datetime.now())
                else:
                    # save new page
                    pageID = self.insertPage(siteID, UNDEFINED, url, None,
                                             req.response.status_code, datetime.now())
                # continue to next url in frontier
                del self.webDriver.requests
                print(f"Worker {self.threadID}: {url} done...")
                continue
        else:
            # some kind of error happened
            if pageID:
                # page already saved
                self.updatePage(pageID, siteID, NO_RESPONSE, None, None, datetime.now())
            else:
                # save new page
                pageID = self.insertPage(siteID, NO_RESPONSE, url, None, None, datetime.now())
            # continue to next url in frontier
            del self.webDriver.requests
            print(f"Worker {self.threadID}: {url} done...")
            continue

        # only if page is of HTML type
        # extract links
        # href
        elements = self.webDriver.find_elements_by_xpath("//*[@href]")
        for element in elements:
            try:
                link = element.get_attribute('href')
                # check if url allowed by robots.txt and if is from .gov.si
                if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                    # canonicalize url
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    # add url to frontier
                    self.addUrlToFrontier(pageID, link)
            except (NoSuchElementException, StaleElementReferenceException):
                continue

        # onclick
        elements = self.webDriver.find_elements_by_xpath("//*[@onclick]")
        for element in elements:
            try:
                line = element.get_attribute('onclick')
                if line:
                    link = ""
                    if "location.href='" in line:
                        rightLine = line.split("location.href='")[1]
                        link = rightLine.split("'")[0]
                    elif "document.location='" in line:
                        rightLine = line.split("document.location='")[1]
                        link = rightLine.split("'")[0]
                    if link != "":
                        # check if url allowed by robots.txt and if is from .gov.si
                        if self.isGov(link) and robot.is_allowed(USER_AGENT, link):
                            # canonicalize url
                            link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                            # add url to frontier
                            self.addUrlToFrontier(pageID, link)
            except (NoSuchElementException, StaleElementReferenceException):
                continue

        # extract images
        elements = self.webDriver.find_elements_by_tag_name('img')
        for element in elements:
            try:
                link = element.get_attribute('src')
                # check if url allowed by robots.txt, if is from .gov.si
                # and if src attribute has a URL
                if (self.isGov(link) and robot.is_allowed(USER_AGENT, link)
                        and re.match(self.urlValidator, link)):
                    link = str(urlcanon.whatwg(urlcanon.parse_url(link)))
                    self.addUrlToFrontier(pageID, link)
            except (NoSuchElementException, StaleElementReferenceException):
                continue

        del self.webDriver.requests
        print(f"Worker {self.threadID}: {url} done...")

    self.conn.close()
    self.webDriver.quit()
    print(f"Worker {self.threadID}: finished crawling.")
def gather_links(self):
    # Define Browser Options
    soup = BeautifulSoup(self.current_page_html, "lxml")
    # Extract links to profiles from TWDS Authors
    links = set()
    images = set()
    for link in soup.find_all("a"):
        current_url_relative = link.get('href')
        current_url = urllib.parse.urljoin(
            self.site_currently_crawling[1], current_url_relative)
        current_parsed_url_urlcanon = urlcanon.parse_url(current_url)
        urlcanon.whatwg(current_parsed_url_urlcanon)
        current_parsed_url = urllib.parse.urlparse(current_url)
        if (current_parsed_url.scheme != "http"
                and current_parsed_url.scheme != "https"):
            continue
        # print("uglyurl: ", current_url, "CANON: ", current_parsed_url_urlcanon,
        #       "current_parsed_url: ", current_parsed_url)
        # print("DOMAIN", self.site_currently_crawling[1])
        # print(" URL------->", current_url, current_parsed_url.geturl())
        links.add(current_parsed_url)

    onclicks = soup.find_all(attrs={'onclick': True})
    if len(onclicks) > 0:
        for onclick in onclicks:
            try:
                x = onclick.find("location=")
                if (x < 0):
                    continue
                onclick_split = onclick.split(onclick[x + 9])
                for index, string in enumerate(onclick_split):
                    if "location=" in string:
                        loc = onclick_split[index + 1]
                        print("onclick location found:", loc)
                        current_url = urllib.parse.urljoin(
                            self.site_currently_crawling[1], loc)
                        current_parsed_url_urlcanon = urlcanon.parse_url(current_url)
                        urlcanon.whatwg(current_parsed_url_urlcanon)
                        current_parsed_url = urllib.parse.urlparse(current_url)
                        links.add(current_parsed_url)
                        break
            except Exception:
                continue

    for image in soup.find_all("img"):
        current_url_relative = image.get('src')
        current_url = urllib.parse.urljoin(
            self.site_currently_crawling[1], current_url_relative)
        current_parsed_url = urllib.parse.urlparse(current_url)
        images.add(current_parsed_url)
    # print(images)

    for image in images:
        fullurl = urllib.parse.urljoin(
            self.site_currently_crawling[1], image.geturl())
        fullurl = urllib.parse.urlparse(fullurl)
        try:
            res = requests.get(fullurl.geturl())
        except Exception:
            continue
        content_type = res.headers['content-type']
        content = res.content
        url = image.geturl()
        path = urllib.parse.urlparse(url).path
        filename = os.path.basename(path)
        db.insert_image(self.page_currently_crawling[0], filename,
                        content_type, content, int(time.time()))

    return list(links)
def canonicalize(self, url):
    if not isinstance(url, urlcanon.ParsedUrl):
        url = urlcanon.parse_url(url)
    for step in self.steps:
        step(url)
    return url
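This canonicalize method shows how urlcanon's canonicalizers work: each is a pipeline of step functions applied to a ParsedUrl in place, and the canonicalized URL is also returned. The ready-made instances used throughout this section (urlcanon.whatwg, urlcanon.aggressive, urlcanon.semantic_precise, urlcanon.google) are all invoked the same way; a small sketch with an assumed input:

import urlcanon

# assumed messy input, purely for illustration
url = urlcanon.parse_url("HTTP://EXAMPLE.com:80/a/../b")
urlcanon.whatwg(url)          # mutates the parsed URL in place
print(url)                    # expected: http://example.com/b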
def canon(s: str) -> str:
    parsed = urlcanon.parse_url(s)
    return str(urlcanon.whatwg(parsed))
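A usage sketch for the canon helper above, fed the kind of messy URL urlcanon's own documentation canonicalizes (extra slashes, uppercase host, default port, dot segments); the exact output is whatever the whatwg canonicalizer produces:

# hypothetical input, for illustration
print(canon("http://///EXAMPLE.com:80/foo/../bar"))
# expected: http://example.com/bar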
def clean_url(s: str) -> str:
    s = s.strip()
    parsed = urlcanon.parse_url(s)
    if not parsed.port and parsed.colon_before_port:
        parsed.colon_before_port = b""
    return str(urlcanon.whatwg(parsed))
def main(self):
    # The page contains HTML, lets scrape it --------------------------------------------------
    firefox_options = FirefoxOptions()
    # Adding a specific user agent
    firefox_options.add_argument("user-agent=fri-ieps-kslk")
    firefox_options.add_argument("--headless")

    print(f"[PageHandler] Retrieving web page URL '{self.page_url}'")
    self.driver = webdriver.Firefox(
        options=firefox_options,
        executable_path=Config.WEB_DRIVER_LOCATION_GECKO)
    self.driver.set_page_load_timeout(10)
    self.driver.get(self.page_url)

    # Timeout needed for Web page to render (read more about it)
    time.sleep(Config.RENDERING_TIMEOUT)
    self.html_content = self.driver.page_source

    # Checking for duplicates ------------------------------------------------------------------
    self.hashed_content = hashlib.md5(
        self.html_content.encode("utf-8")).hexdigest()
    is_duplicate = self.session.query(Page).filter(
        Page.content_hash == self.hashed_content).first()
    if is_duplicate:
        self.page_db.page_type_code = "DUPLICATE"
        self.page_db.http_status_code = self.status_code
        self.page_db.site_id = self.site_id
        self.page_db.url = self.page_url
        self.page_db.accessed_time = getTimestamp()
        self.page_db.content_hash = self.hashed_content
        self.session.commit()
        self.session.close()
        self.driver.quit()
        return

    # The page is valid html and its not a duplicate, now we extract all the links on the page ---
    links = []
    # First, we extract the links with tag name "a"
    elems = self.driver.find_elements_by_tag_name("a")
    for elem in elems:
        href = elem.get_attribute('href')
        if href is None:
            continue
        if href.startswith("/"):
            links.append(self.base_url + href)
        elif href is not None and ("http" in href or "https" in href):
            links.append(href)

    # We also extract links from the onclick sections
    onclicks = self.driver.find_elements_by_xpath("//*[@onclick]")
    for el in onclicks:
        temp = el.get_attribute("onclick")
        if "location.href=" in temp:
            temp = temp.replace("location.href=", "")\
                .replace("\'", "")\
                .replace("\"", "")
            links.append(temp)

    # Remove the links that point outside of .gov
    links_trancuted = []
    for el in links:
        if "gov.si/" in el:
            links_trancuted.append(el)
    links = links_trancuted

    # Put the links in the canonical form
    links_canonical = []
    for el in links:
        parsed_link = urlcanon.parse_url(el)
        urlcanon.whatwg(parsed_link)
        links_canonical.append(str(parsed_link))
    links = links_canonical

    # Save the links to the DB -----------------------------------------------------------------
    for link in links:
        # Check if link is already in the DB
        is_duplicate = self.session.query(Page).filter(
            Page.url == link).first()
        if is_duplicate is None:
            extracted_domain_name = get_domain_name_from_url(link)
            page = Page()
            page.site_id = self.get_site_id_for_page(extracted_domain_name)
            # Pages with status == None have yet to be visited
            page.status = None
            page.page_type_code = "FRONTIER"
            page.url = link
            self.session.add(page)
            self.session.commit()

            # Also add a Link to the DB
            link_ = Link()
            link_.from_page = self.page_id
            link_.to_page = self.session.query(Page).filter(
                Page.url == link).first().id
            self.session.add(link_)
            self.session.commit()
        # else:
        #     print(f"Page {link} is already in the DB")

    # Finding and storing the images on the page --------------------------------------------------
    imgs = self.driver.find_elements_by_tag_name("img")
    for elem in imgs:
        src = elem.get_attribute("src")
        url = ""
        if src is None:
            continue
        if src.startswith("/"):
            url = self.base_url + src
        elif src is not None and ("http" in src or "https" in src):
            url = src
        if url != "" and len(url) <= 255:
            # Save the image
            image = Image()
            image.page_id = self.page_id
            image.filename = url
            image.content_type = "BINARY"
            image.accessed_time = getTimestamp()
            self.session.add(image)
            self.session.commit()

    # With all the data scraped, we can save the page to the DB -------------------------------------
    self.page_db.html_content = self.html_content
    self.page_db.accessed_time = getTimestamp()
    self.page_db.content_hash = self.hashed_content
    self.page_db.http_status_code = self.status_code
    self.page_db.site_id = self.site_id
    self.page_db.page_type_code = "HTML"
    self.page_db.url = self.page_url
    self.session.commit()

    # Lets be responsible and close the session and the driver
    self.session.close()
    self.driver.quit()