Example #1
    def _discover_page_link_data(self, url, html_body):
        # Prepare to store any URL parameters present in links on the page
        self.url_data[url]['urlparams'] = set()
        # We also store the unique links accessible from the given page
        self.url_data[url]['accessible_links'] = set()
        all_links = set(
            filter(lambda href: validate_url(href, self.source_url),
                   html_body.xpath("//a/@href")))

        for link in all_links:
            absolute_link = self._generate_absolute_link(link)

            # Record the absolute form of every link reachable from this page
            self.url_data[url]['accessible_links'].add(absolute_link)

            # Extract the URL parameters from the link and store them in the
            # data structure
            urlparams = get_url_params(absolute_link)
            self.url_data[url]['urlparams'].update(urlparams)

        # Queue newly discovered links with their URL parameters stripped
        all_links = set(
            map(lambda href: trim_url_params(self._generate_absolute_link(href)),
                all_links))
        for link in (all_links - self.discovered_urls):
            self.discovered_urls.add(link)
            self.urlqueue.append(link)
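
The crawler above leans on two query-string helpers that are not shown in the snippet. A minimal sketch of what get_url_params and trim_url_params could look like, assuming they simply read and strip the query string with urllib.parse; the real implementations may differ:

from urllib.parse import parse_qsl, urlparse, urlunparse


def get_url_params(url):
    # Return the set of query-parameter names present in the URL
    return {name for name, _ in parse_qsl(urlparse(url).query)}


def trim_url_params(url):
    # Return the URL with its query string and fragment stripped
    parts = urlparse(url)
    return urlunparse((parts.scheme, parts.netloc, parts.path, '', '', ''))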
Example #2
    def remove_arc_map_service_from_uri(self, uri):
        if validate_url(uri):
            # Find the first registered map server matching this URI, if any
            ams = next((a for a in self._arc_map_servers if a.uri == uri),
                       None)
            if ams is not None:
                self.remove_arc_map_server(ams)
        else:
            warn("not a valid URL: {}".format(uri))
Example #3
def shorten_url():
    body = request.get_json()

    # Reject bodies that are missing, not JSON objects, or lack a 'url' key
    if not isinstance(body, dict) or 'url' not in body:
        return {'ok': False, 'reason': 'Invalid body'}, 400

    if not validate_url(body['url']):
        return {'ok': False, 'reason': 'Malformed URL'}, 400

    # Build the short URL from a random 5-character suffix
    suffix = rand_str(5)
    short_url = '{proto}://{base}/go/{suffix}'.format(proto=PROTO,
                                                      base=BASE,
                                                      suffix=suffix)

    # Persist the suffix -> original URL mapping so the /go/ path can resolve it
    _redis.set(suffix, body['url'])

    return {'ok': True, 'url': short_url}
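
The short URL built above points at a /go/<suffix> path, so the service presumably has a companion handler that looks the suffix up in Redis and redirects. The route, error shape, and decoding below are assumptions, not part of the original snippet:

from flask import redirect


@app.route('/go/<suffix>')  # hypothetical route; app is assumed to exist
def resolve_short_url(suffix):
    target = _redis.get(suffix)
    if target is None:
        return {'ok': False, 'reason': 'Unknown suffix'}, 404
    # redis-py returns bytes unless the client was created with decode_responses=True
    return redirect(target.decode() if isinstance(target, bytes) else target)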
Example #4
    def runTestTwo(self):
        self.prt("\n>>> STARTING TEST TWO - Dead links checker")
        self.prt(f"[{TARGET_DOMAIN}] Parsing href, src links", end="\r")

        self.scraped_elements_for_url[
            TARGET_DOMAIN] = self.parse_all_page_elements(
                TARGET_DOMAIN, '[href],[src]')
        if not self.scraped_elements_for_url[TARGET_DOMAIN]:
            self.prt("[ERROR] Could not parse sublinks. Moving forward")

        links = []
        for a in self.scraped_elements_for_url[TARGET_DOMAIN]:
            link = {}
            try:
                link['raw'] = a['attributes']['href']
            except (KeyError, TypeError):
                try:
                    link['raw'] = a['attributes']['src']
                    # Skip inline Base64-encoded data URIs
                    if 'data:' in link['raw']:
                        continue
                except (KeyError, TypeError):
                    continue

            # Ignore links containing pre-configured keywords
            if any(keyword in link['raw']
                   for keyword in TEST_TWO_IGNORE_KEYWORDS):
                continue

            # The link may not be a full URL; it could be a route on the
            # target site
            if parse_domain(link['raw']) != parse_domain(TARGET_DOMAIN):
                # Different domain, so this may still be a route on the same
                # website; validate it to find out
                parsed = validate_url(link['raw'])

                # validate_url returns True for a well-formed external URL;
                # anything else is treated as a route and appended to the
                # target domain
                if parsed is not True:
                    if parsed.startswith("//"):
                        link['formatted'] = "https:" + parsed
                    else:
                        link['formatted'] = parsed.replace('//', '/')
                        link['formatted'] = TARGET_DOMAIN + "/" + link['formatted']
                else:
                    continue
            else:
                link['formatted'] = link['raw']

            links.append(link)

        total_links_tested = 0
        total_links = len(links)

        # Iterate over the collected links and perform the loading test
        for i in range(0, total_links):
            self.prt(f"Testing link {i} / {total_links}", end="\r")

            if request_success_url(URL=links[i]['formatted']):
                total_links_tested += 1
            else:
                self.dead_links_found.append(links[i]['raw'])

        total_errors = len(self.dead_links_found)

        if total_errors > 0:
            for dead_link in self.dead_links_found:
                self.prt(f"[ERROR] DEAD {dead_link}")
        else:
            self.prt(f"[OK] Links tested: {total_links_tested}")
Example #5
    def runTestFour(self):
        self.prt("\n>>> STARTING TEST FOUR - Insecure Content Links Checker")
        self.prt(f"[{TARGET_DOMAIN}] Parsing Webpage Links", end="\r")

        links = []

        # This test runs on all subpages plus TARGET_DOMAIN.
        # Check whether links have already been parsed by previous tests.
        if self.scraped_elements_for_url[TARGET_DOMAIN]:
            found_elements = get_elements_by_selector(TARGET_DOMAIN,
                                                      '[href], [src]')
            if found_elements is None or len(found_elements) < 1:
                self.prt(
                    "[ERROR] Could not parse any page links. Moving forward to next test"
                )
                return
            for cur in found_elements:
                # Collect the href or src attribute of each element found
                if 'href' in cur['attributes']:
                    link = cur['attributes']['href']
                elif 'src' in cur['attributes']:
                    link = cur['attributes']['src']
                else:
                    continue

                links.append(link)
        else:
            links = self.first_page_alive_hrefs + self.first_page_alive_srcs

        total_links_tested = 0
        errors = []
        warnings = []
        for link in links:
            # Ignore links containing pre-configured keywords
            if any(keyword in link for keyword in TEST_FOUR_IGNORE_KEYWORDS):
                continue

            if parse_domain(link) != parse_domain(TARGET_DOMAIN):
                # Different domain, so this may still be a route on the same
                # website; validate it to find out
                parsed = validate_url(link)
                if parsed is not True:
                    link = TARGET_DOMAIN + f"/{parsed}"

            if "https" not in link:
                # No HTTPS found. Check for plain HTTP, otherwise warn that
                # the scheme could not be recognized
                if 'http' in link:
                    errors.append(f"[ERROR] FOUND HTTP LINK ({link})")
                else:
                    warnings.append(f"[WARNING] UNRECOGNIZED LINK ({link})")
            total_links_tested += 1

        total_errors = len(errors)
        total_warnings = len(warnings)
        if total_errors > 0:
            for error in errors:
                self.prt(error)

        if total_warnings > 0:
            for warning in warnings:
                self.prt(warning)

        if not total_errors:
            self.prt(f"[OK] Links tested: {total_links_tested}")
Example #6
    def runTestThree(self):
        self.prt(">>> STARTING TEST THREE ( Sub-pages loading time test )")

        page_elements = []
        # Check whether any DOM elements were already parsed for this page
        if TARGET_DOMAIN not in self.scraped_elements_for_url:
            self.prt(f"[{TARGET_DOMAIN}] Parsing href links", end="\r")
            page_elements = get_elements_by_selector(TARGET_DOMAIN, '[href]')

            if page_elements is None or len(page_elements) < 1:
                self.prt(
                    "[ERROR] Could not parse any page links. Moving forward to next test"
                )
                return
            self.scraped_elements_for_url[TARGET_DOMAIN] = page_elements
        else:
            page_elements = self.scraped_elements_for_url[TARGET_DOMAIN]

        # Get links from the DOM elements
        links = []
        for cur in page_elements:
            try:
                if 'href' in cur['attributes']:
                    links.append(cur['attributes']['href'])
            except (KeyError, TypeError):
                continue

        for link in links:
            # Ignore links already reported as dead
            if link in self.dead_links_found:
                print("Skipping from dead link")
                continue

            # Skip links matching the ignore keywords for test three
            if any(keyword in link for keyword in TEST_THREE_IGNORE_KEYWORDS):
                print("Skipping from ignore list")
                continue

            # Validate the link and check whether it is on the same domain
            if parse_domain(link) != parse_domain(TARGET_DOMAIN):
                # Different domain, so this may still be a route on the same
                # website; validate it to find out
                parsed = validate_url(link)

                # validate_url returns True for a well-formed external URL;
                # anything else is treated as a route and appended to the
                # target domain
                if parsed is not True:
                    if parsed.startswith("//"):
                        link = "https:" + parsed
                    else:
                        link = parsed.replace('//', '/')
                        link = TARGET_DOMAIN + "/" + link
                else:
                    continue

            pretty_print = f"/{parse_path(link)}"

            average_load_time = perform_load_test(
                link,
                5,
                onPerformingRetry=lambda retry: self.prt(
                    f"[{pretty_print}] Performing loading time test {retry} / {TEST_TWO_MAX_RETRIES}",
                    end="\r"))

            if average_load_time <= TEST_TWO_MAX_LOADING_SECONDS:
                self.prt(
                    f"[OK] ({pretty_print})  Average load time: {average_load_time}s"
                )
            else:
                self.prt(
                    f"[ERROR] ({pretty_print}) Average load time exceeded with: {average_load_time}s"
                )
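
perform_load_test is called with a URL, a retry count, and an onPerformingRetry callback, and returns an average load time in seconds. A sketch under those assumptions; the timing and averaging strategy are guesses:

import time

import requests


def perform_load_test(url, retries, onPerformingRetry=None):
    timings = []
    for retry in range(1, retries + 1):
        if onPerformingRetry is not None:
            onPerformingRetry(retry)
        start = time.monotonic()
        try:
            requests.get(url, timeout=30)
        except requests.RequestException:
            continue
        timings.append(time.monotonic() - start)
    # Average only the attempts that succeeded; 0.0 if every request failed
    return sum(timings) / len(timings) if timings else 0.0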
Example #7
    def remove_arc_service_from_uri(self, uri):
        if not validate_url(uri):
            raise Exception("URI for arc service is not valid")
        # Find the first registered service matching this URI, if any
        arc_s = next((s for s in self.arc_services if s.uri == uri), None)
        if arc_s is not None:
            self.remove_arc_service(arc_s)
Example #8
    def add_arc_service_from_uri(self, uri):
        if not validate_url(uri):
            raise Exception("URI for arc service is not valid")
        # Use the last path segment of the URI as the service name
        name = uri.split("/")[-1]
        arc_s = arc_service.ArcService(uri, name, self, self.db_client)
        self.add_arc_service(arc_s)
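
A self-contained sketch of the add/remove-by-URI flow that Examples #7 and #8 implement; the stub classes and the URI below are hypothetical and only illustrate the lookup pattern:

from dataclasses import dataclass


@dataclass
class StubArcService:
    # Stand-in for arc_service.ArcService, reduced to the fields used here
    uri: str
    name: str


class StubManager:
    # Stand-in manager holding a flat list of services, as the methods above assume
    def __init__(self):
        self.arc_services = []

    def add_arc_service_from_uri(self, uri):
        self.arc_services.append(StubArcService(uri, uri.split("/")[-1]))

    def remove_arc_service_from_uri(self, uri):
        # First service matching the URI, or None if nothing was registered for it
        match = next((s for s in self.arc_services if s.uri == uri), None)
        if match is not None:
            self.arc_services.remove(match)


manager = StubManager()
manager.add_arc_service_from_uri("https://example.com/arc/elevation")
manager.remove_arc_service_from_uri("https://example.com/arc/elevation")
assert not manager.arc_services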