def _discover_page_link_data(self, url, html_body):
    # Prepare to store any url parameters present in links on the page
    self.url_data[url]['urlparams'] = set()
    # We're also storing unique links accessible from the given page
    self.url_data[url]['accessible_links'] = set()
    all_links = set(
        filter(lambda href: validate_url(href, self.source_url),
               html_body.xpath("//a/@href")))
    for link in all_links:
        absolute_link = self._generate_absolute_link(link)
        # store the absolute form of each link reachable from this page
        self.url_data[url]['accessible_links'].add(absolute_link)
        # get the url parameters from the url and store them in the data
        # structure
        urlparams = get_url_params(absolute_link)
        self.url_data[url]['urlparams'].update(urlparams)
    # For crawling, queue the absolute links with their url parameters trimmed
    all_links = set(
        map(lambda href: trim_url_params(self._generate_absolute_link(href)),
            all_links))
    for link in (all_links - self.discovered_urls):
        self.discovered_urls.add(link)
        self.urlqueue.append(link)
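# The crawler above relies on get_url_params and trim_url_params, which are
# not shown here. A minimal sketch of what they might look like, assuming
# they simply wrap urllib.parse (the real helpers may behave differently):

from urllib.parse import parse_qs, urlsplit, urlunsplit


def get_url_params(url):
    # Return the set of query-parameter names found in the URL.
    return set(parse_qs(urlsplit(url).query).keys())


def trim_url_params(url):
    # Return the URL with its query string and fragment removed.
    scheme, netloc, path, _, _ = urlsplit(url)
    return urlunsplit((scheme, netloc, path, '', ''))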
def remove_arc_map_service_from_uri(self, uri):
    if validate_url(uri):
        ams = ([a for a in self._arc_map_servers if a.uri == uri] + [None])[0]
        if ams is not None:
            self.remove_arc_map_server(ams)
    else:
        warn("not a valid Url: {}".format(uri))
def shorten_url():
    body = request.get_json()
    if not isinstance(body, dict) or 'url' not in body:
        return {'ok': False, 'reason': 'Invalid body'}, 400
    if not validate_url(body['url']):
        return {'ok': False, 'reason': 'Malformed URL'}, 400
    suffix = rand_str(5)
    short_url = '{proto}://{base}/go/{suffix}'.format(proto=PROTO,
                                                      base=BASE,
                                                      suffix=suffix)
    _redis.set(suffix, body['url'])
    return {'ok': True, 'url': short_url}
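# shorten_url reads JSON from flask.request, so it is presumably registered as
# a Flask view; the route decorator is not shown above. A minimal sketch of
# how it could be wired up and exercised with Flask's test client. The
# '/shorten' path is an assumption, and PROTO, BASE, rand_str and a reachable
# _redis instance are taken from the surrounding module:

from flask import Flask, request

app = Flask(__name__)
app.add_url_rule('/shorten', view_func=shorten_url, methods=['POST'])

with app.test_client() as client:
    resp = client.post('/shorten', json={'url': 'https://example.com/page'})
    print(resp.status_code, resp.get_json())  # e.g. 200 {'ok': True, 'url': ...}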
def runTestTwo(self):
    self.prt("\n>>> STARTING TEST TWO - Dead links checker")
    self.prt(f"[{TARGET_DOMAIN}] Parsing href, src links", end="\r")
    self.scraped_elements_for_url[TARGET_DOMAIN] = self.parse_all_page_elements(
        TARGET_DOMAIN, '[href],[src]')
    if not self.scraped_elements_for_url[TARGET_DOMAIN]:
        self.prt(f"[ERROR] Could not parse sublinks. Moving forward")
    links = []
    for a in self.scraped_elements_for_url[TARGET_DOMAIN]:
        link = {}
        try:
            link['raw'] = a['attributes']['href']
        except KeyError:
            try:
                link['raw'] = a['attributes']['src']
                # Skip links that come from Base64 encoded data URIs
                if 'data:' in link['raw']:
                    continue
            except KeyError:
                continue
        # Ignore links containing pre-configured keywords
        if any(keyword in link['raw'] for keyword in TEST_TWO_IGNORE_KEYWORDS):
            continue
        # Check if the link is not a URL. It may be a route!
        if parse_domain(link['raw']) != parse_domain(TARGET_DOMAIN):
            # They're not the same domain. This link may be a route to the
            # same website, so let's check.
            parsed = validate_url(link['raw'])
            # If True is returned the link is fine. Otherwise it may be a
            # route, so append it to the TARGET_DOMAIN.
            if parsed is not True:
                if parsed[0:2] == "//":
                    link['formatted'] = "https:" + parsed
                else:
                    link['formatted'] = parsed.replace('//', '/')
                    link['formatted'] = TARGET_DOMAIN + "/" + link['formatted']
            else:
                continue
        else:
            link['formatted'] = link['raw']
        links.append(link)
    total_links_tested = 0
    total_links = len(links)
    for i in range(0, total_links):
        # Iterate over the collected links and perform a loading test
        self.prt(f"Testing link {i} / {total_links}", end="\r")
        if request_success_url(URL=links[i]['formatted']):
            total_links_tested += 1
        else:
            self.dead_links_found.append(links[i]['raw'])
    total_errors = len(self.dead_links_found)
    if total_errors > 0:
        for i in range(0, total_errors):
            self.prt(f"[ERROR] DEAD {self.dead_links_found[i]}")
    else:
        self.prt(f"[OK] Links tested: {total_links_tested}")
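# runTestTwo treats a link as dead when request_success_url returns a falsy
# value. The helper itself is not shown; a minimal sketch of one plausible
# implementation based on the requests library (the timeout and the status
# threshold are assumptions, not the project's actual logic):

import requests


def request_success_url(URL, timeout=10):
    # Return True if the URL answers with a non-error HTTP status.
    try:
        response = requests.get(URL, timeout=timeout, allow_redirects=True)
        return response.status_code < 400
    except requests.RequestException:
        return False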
def runTestFour(self):
    self.prt("\n>>> STARTING TEST FOUR - Insecure Content Links Checker")
    self.prt(f"[{TARGET_DOMAIN}] Parsing Webpage Links", end="\r")
    links = []
    # This test runs on all subpages plus TARGET_DOMAIN.
    # Check if links have already been parsed by previous tasks.
    if self.scraped_elements_for_url[TARGET_DOMAIN]:
        found_elements = get_elements_by_selector(TARGET_DOMAIN,
                                                  '[href], [src]')
        if found_elements is None or len(found_elements) < 1:
            self.prt(
                f"[ERROR] Could not parse any page links. Moving forward to next test"
            )
            return
        for i in range(0, len(found_elements)):
            # Collect the href/src value of each found element
            cur = found_elements[i]
            if 'href' in cur['attributes']:
                link = cur['attributes']['href']
            elif 'src' in cur['attributes']:
                link = cur['attributes']['src']
            else:
                continue
            links.append(link)
    else:
        links = self.first_page_alive_hrefs + self.first_page_alive_srcs
    total_links_tested = 0
    errors = []
    warnings = []
    total_links = len(links)
    for i in range(0, total_links):
        link = links[i]
        # Ignore links containing pre-configured keywords
        if any(keyword in link for keyword in TEST_FOUR_IGNORE_KEYWORDS):
            continue
        if parse_domain(link) != parse_domain(TARGET_DOMAIN):
            # They're not the same domain. This link may be a route to the
            # same website, so let's check.
            parsed = validate_url(link)
            if parsed:
                link = TARGET_DOMAIN + f"/{parsed}"
        if "https" not in link:
            # No HTTPS link was found. Check if it's plain HTTP, otherwise
            # warn that the scheme could not be recognized.
            if 'http' in link:
                errors.append(f"[ERROR] FOUND HTTP LINK ({link})")
            else:
                warnings.append(f"[WARNING] UNRECOGNIZED LINK ({link})")
        total_links_tested += 1
    total_errors = len(errors)
    total_warnings = len(warnings)
    if total_errors > 0:
        for i in range(0, total_errors):
            self.prt(errors[i])
    if total_warnings > 0:
        for i in range(0, total_warnings):
            self.prt(warnings[i])
    if not total_errors:
        self.prt(f"[OK] Links tested: {total_links_tested}")
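# The substring tests above ("https" not in link, 'http' in link) also match
# URLs that merely contain those strings in a path or query. If stricter
# behavior were wanted, the scheme could be inspected directly; a small sketch
# of that alternative check (not what the test suite currently does):

from urllib.parse import urlparse


def classify_link_scheme(link):
    # Return 'secure', 'insecure', or 'unknown' based on the URL scheme only.
    scheme = urlparse(link).scheme.lower()
    if scheme == 'https':
        return 'secure'
    if scheme == 'http':
        return 'insecure'
    return 'unknown'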
def runTestThree(self):
    self.prt(">>> STARTING TEST THREE ( Sub-pages loading time test )")
    page_elements = []
    # Check if there are any DOM elements parsed for the current page
    if TARGET_DOMAIN not in self.scraped_elements_for_url:
        self.prt(f"[{TARGET_DOMAIN}] Parsing href links", end="\r")
        page_elements = get_elements_by_selector(TARGET_DOMAIN, '[href]')
        if page_elements is None or len(page_elements) < 1:
            self.prt(
                f"[ERROR] Could not parse any page links. Moving forward to next test"
            )
            return
        self.scraped_elements_for_url[TARGET_DOMAIN] = page_elements
    else:
        page_elements = self.scraped_elements_for_url[TARGET_DOMAIN]
    # Get links from DOM elements
    links = []
    for i in range(0, len(page_elements)):
        try:
            cur = page_elements[i]
            if 'href' in cur['attributes']:
                link = cur['attributes']['href']
            else:
                continue
        except (KeyError, TypeError):
            continue
        links.append(link)
    total_links = len(links)
    for i in range(0, total_links):
        link = links[i]
        # Ignore DEAD links found by the previous test
        must_skip = link in self.dead_links_found
        if must_skip:
            print("Skipping from dead link")
            continue
        # Skip IGNORE keywords for test three
        for ignore in TEST_THREE_IGNORE_KEYWORDS:
            if ignore in link:
                must_skip = True
        if must_skip:
            print("Skipping from ignore list")
            continue
        # Validate the link and check whether it's on the same domain
        if parse_domain(link) != parse_domain(TARGET_DOMAIN):
            # They're not the same domain. This link may be a route to the
            # same website, so let's check.
            parsed = validate_url(link)
            # If True is returned the link is fine. Otherwise it may be a
            # route, so append it to the TARGET_DOMAIN.
            if parsed is not True:
                if parsed[0:2] == "//":
                    link = "https:" + parsed
                else:
                    link = parsed.replace('//', '/')
                    link = TARGET_DOMAIN + "/" + link
            else:
                continue
        pretty_print = f"/{parse_path(link)}"
        average_load_time = perform_load_test(
            link,
            5,
            onPerformingRetry=lambda retry: self.prt(
                f"[{pretty_print}] Performing loading time test {retry} / {TEST_TWO_MAX_RETRIES}",
                end="\r"))
        if average_load_time <= TEST_TWO_MAX_LOADING_SECONDS:
            self.prt(
                f"[OK] ({pretty_print}) Average load time: {average_load_time}s"
            )
        else:
            self.prt(
                f"[ERROR] ({pretty_print}) Average load time exceeded with: {average_load_time}s"
            )
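# perform_load_test is called with a retry count and an onPerformingRetry
# callback and is expected to return an average load time in seconds. A
# minimal sketch of a compatible helper built on requests; the project's real
# implementation (which may use a headless browser) is not shown above:

import time

import requests


def perform_load_test(url, retries, onPerformingRetry=None):
    # Fetch the URL `retries` times and return the mean wall-clock duration.
    durations = []
    for attempt in range(1, retries + 1):
        if onPerformingRetry is not None:
            onPerformingRetry(attempt)
        start = time.monotonic()
        requests.get(url, timeout=30)
        durations.append(time.monotonic() - start)
    return sum(durations) / len(durations)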
def remove_arc_service_from_uri(self, uri):
    if not validate_url(uri):
        raise Exception("Uri for arc service is not valid")
    arc_s = ([s for s in self.arc_services if s.uri == uri] + [None])[0]
    if arc_s is not None:
        self.remove_arc_service(arc_s)
def add_arc_service_from_uri(self, uri):
    if not validate_url(uri):
        raise Exception("Uri for arc service is not valid")
    name = uri.split("/")[-1]
    arc_s = arc_service.ArcService(uri, name, self, self.db_client)
    self.add_arc_service(arc_s)
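# Every snippet above depends on some validate_url helper, and the call sites
# suggest the signatures differ between projects (one passes a source URL as a
# second argument, another expects a parsed route back instead of a boolean).
# A minimal boolean sketch built on urllib.parse, for illustration only:

from urllib.parse import urlparse


def validate_url(url):
    # Accept only absolute http(s) URLs that carry a network location.
    try:
        parts = urlparse(url)
    except (ValueError, AttributeError):
        return False
    return parts.scheme in ('http', 'https') and bool(parts.netloc)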