Exemplo n.º 1
0
def extractor(url):
    """Fetch ``url`` and extract links and other findings from the response.

    The response body is obtained through ``requester`` and, when ``clone``
    is truthy, handed to ``mirror``. Every ``rhref`` match is cleaned up and,
    if ``is_link`` accepts it, added to the shared ``internal`` or
    ``external`` collection. Afterwards the body is passed on to the
    optional extractors:

    * ``intel_extractor`` and ``js_extractor`` unless ``only_urls`` is set,
    * ``regxy`` when ``args.regex`` is given and ``supress_regex`` is falsy,
    * an entropy scan over ``rentropy`` matches when ``api`` is set; matches
      scoring at least 4 are stored in ``keys`` as ``"<url>: <match>"``.

    Args:
        url: Address of the page to fetch and analyse.

    Returns:
        None. Results are recorded in the shared collections.
    """
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)

    for found in rhref.findall(response):
        # Strip quote characters, then drop everything after a "#" so that
        # in-page anchors do not yield separate URLs.
        link = found[1].replace('\'', '').replace('"', '').split('#')[0]
        # Skip anything that should not be crawled.
        if not is_link(link, processed, files):
            continue

        if link[:4] == 'http':
            # Absolute URL: internal only when it lives under main_url.
            if link.startswith(main_url):
                verb('Internal page', link)
                internal.add(link)
            else:
                verb('External page', link)
                external.add(link)
        elif link[:2] == '//':
            # Protocol-relative URL: element 2 of the "/" split is the host.
            if link.split('/')[2].startswith(host):
                verb('Internal page', link)
                # The link already begins with "//"; drop those two slashes
                # before prefixing, otherwise the result contains ":////".
                # NOTE(review): assumes schema is a bare scheme name such as
                # "http" (no trailing colon) - confirm where it is defined.
                internal.add(schema + '://' + link[2:])
            else:
                verb('External page', link)
                external.add(link)
        elif link[:1] == '/':
            verb('Internal page', link)
            internal.add(remove_file(url) + link)
        else:
            # Relative link. It cannot start with "/" here (handled by the
            # branch above), so only the base's trailing slash matters.
            verb('Internal page', link)
            usable_url = remove_file(url)
            if usable_url.endswith('/'):
                internal.add(usable_url + link)
            else:
                internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        for match in rentropy.findall(response):
            # Only candidates with an entropy score of at least 4 are kept.
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)
Exemplo n.º 2
0
def extractor(url):
    """Fetch ``url`` and extract links and other findings from the response.

    The response body is obtained through ``requester`` and, when ``clone``
    is truthy, handed to ``mirror``. Every ``rhref`` match is cleaned up and,
    if ``is_link`` accepts it, added to the shared ``internal`` or
    ``external`` collection. Afterwards the body is passed on to the
    optional extractors:

    * ``intel_extractor`` and ``js_extractor`` unless ``only_urls`` is set,
    * ``regxy`` when ``args.regex`` is given and ``supress_regex`` is falsy,
    * an entropy scan over ``rentropy`` matches when ``api`` is set; matches
      scoring at least 4 are stored in ``keys`` as ``"<url>: <match>"``.

    Args:
        url: Address of the page to fetch and analyse.

    Returns:
        None. Results are recorded in the shared collections.
    """
    response = requester(url, main_url, delay, cook, headers, timeout, host,
                         proxies, user_agents, failed, processed)
    if clone:
        mirror(url, response)

    for found in rhref.findall(response):
        # Strip quote characters, then drop everything after a "#" so that
        # in-page anchors do not yield separate URLs.
        link = found[1].replace('\'', '').replace('"', '').split('#')[0]
        # Skip anything that should not be crawled.
        if not is_link(link, processed, files):
            continue

        if link[:4] == 'http':
            # Absolute URL: internal only when it lives under main_url.
            if link.startswith(main_url):
                verb('Internal page', link)
                internal.add(link)
            else:
                verb('External page', link)
                external.add(link)
        elif link[:2] == '//':
            # Protocol-relative URL: element 2 of the "/" split is the host.
            if link.split('/')[2].startswith(host):
                verb('Internal page', link)
                # The link already begins with "//"; drop those two slashes
                # before prefixing, otherwise the result contains ":////".
                # NOTE(review): assumes schema is a bare scheme name such as
                # "http" (no trailing colon) - confirm where it is defined.
                internal.add(schema + '://' + link[2:])
            else:
                verb('External page', link)
                external.add(link)
        elif link[:1] == '/':
            verb('Internal page', link)
            internal.add(remove_file(url) + link)
        else:
            # Relative link. It cannot start with "/" here (handled by the
            # branch above), so only the base's trailing slash matters.
            verb('Internal page', link)
            usable_url = remove_file(url)
            if usable_url.endswith('/'):
                internal.add(usable_url + link)
            else:
                internal.add(usable_url + '/' + link)

    if not only_urls:
        intel_extractor(url, response)
        js_extractor(response)
    if args.regex and not supress_regex:
        regxy(args.regex, response, supress_regex, custom)
    if api:
        for match in rentropy.findall(response):
            # Only candidates with an entropy score of at least 4 are kept.
            if entropy(match) >= 4:
                verb('Key', match)
                keys.add(url + ': ' + match)