Example 1
    def get_requests_from_robots(self, request):
        purl = urlsplit(request.url)
        url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        getreq = Request(REQTYPE_LINK,
                         "GET",
                         url,
                         extra_headers=Shared.options['extra_headers'])
        try:
            # request, timeout, retries=None, useragent=None, proxy=None):
            httpget = HttpGet(getreq, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            lines = httpget.get_file().split("\n")
        except urllib.error.HTTPError:
            return []
        except Exception:
            # any other failure: treat it as "no robots.txt available"
            return []

        requests = []
        for line in lines:
            directive = ""
            url = None
            try:
                directive, url = re.sub(r"#.*", "", line).split(":", 1)
            except ValueError:
                continue  # not a "directive: value" line

            if re.match("(dis)?allow", directive.strip(), re.I):
                req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
                requests.append(req)

        return adjust_requests(requests) if requests else []
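
Each robots.txt line is reduced to a directive/value pair by stripping comments and splitting on the first colon; a small standalone sketch of that step (the sample line is made up):

import re

# hypothetical robots.txt line; the parsing mirrors the loop above
line = "Disallow: /private/  # not for crawlers"
directive, url = re.sub(r"#.*", "", line).split(":", 1)
print(directive.strip(), url.strip())  # Disallow /private/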
Example 2
    def test___eq__with_post(self, remove_tokens_mock):
        a = Request("type1", "POST", "url1", data="dataXXXX")
        b = Request("type1", "POST", "url1", data="dataYYYY")

        self.assertTrue(a == b)
        self.assertEqual(remove_tokens_mock.call_args_list,
                         [call("dataXXXX"), call("dataYYYY")])
Example 3
File: probe.py Project: zdoop/htcap
    def __init__(self, data, parent):
        self.status = "ok"
        self.requests = []
        self.cookies = []
        self.redirect = None
        # if True the probe returned no error BUT the json is not closed properly
        self.partialcontent = False
        self.html = None
        self.user_output = []
        self.page_hash = 0

        status = data.pop()

        if status['status'] == "error":
            self.status = "error"
            self.errcode = status['code']

        if "partialcontent" in status:
            self.partialcontent = status['partialcontent']

        # grab cookies before creating requests
        for key, val in data:
            if key == "cookies":
                for cookie in val:
                    self.cookies.append(Cookie(cookie, parent.url))

        if "redirect" in status:
            self.redirect = status['redirect']
            r = Request(REQTYPE_REDIRECT,
                        "GET",
                        self.redirect,
                        parent=parent,
                        set_cookie=self.cookies,
                        parent_db_id=parent.db_id)
            self.requests.append(r)

        for key, val in data:
            if key == "request":
                trigger = val['trigger'] if 'trigger' in val else None
                #try:
                r = Request(val['type'],
                            val['method'],
                            val['url'],
                            parent=parent,
                            set_cookie=self.cookies,
                            data=val['data'],
                            trigger=trigger,
                            parent_db_id=parent.db_id)
                self.requests.append(r)
                #except Exception as e:
                #	pass
            elif key == "html":
                self.html = val
            elif key == "page_hash":
                page_hash = TextHash(val).hash
                self.page_hash = page_hash if page_hash else 0
            elif key == "user":
                self.user_output.append(val)
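
The constructor above walks a list of [key, value] pairs terminated by a status object (hence the data.pop() call); a hypothetical payload showing the shape it expects, with field names inferred from the code rather than taken from the real probe output:

probe_data = [
    ["cookies", [{"name": "sid", "value": "abc123", "domain": "example.com", "path": "/"}]],
    ["request", {"type": "xhr", "method": "POST", "url": "http://example.com/api", "data": "a=1"}],
    ["html", "<html>...</html>"],
    ["user", "output emitted by a user script"],
    {"status": "ok", "partialcontent": False},
]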
Example 4
    def get_requests(self, types="xhr"):
        """
        return a list of request matching the given types
    
        connect, retrieve the requests list then close the connection
    
        :param types: string of types (comma separated)
        :return: list of matching request
        """
        types = types.split(",")
        ret = []
        qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s)" % ",".join(
            "?" * len(types))

        self.connect()
        cur = self.conn.cursor()
        cur.execute(qry, types)
        for r in cur.fetchall():
            # !! parent must be null (or unset)
            req = Request(r['type'],
                          r['method'],
                          r['url'],
                          referer=r['referer'],
                          data=r['data'],
                          json_cookies=r['cookies'],
                          db_id=r['id'],
                          parent_db_id=r['id_parent'])
            ret.append(req)
        self.close()

        return ret
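
The query above gets one '?' placeholder per requested type before being handed to cursor.execute(); a standalone sketch of that string construction:

types = "xhr,link,form".split(",")
qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s)" % ",".join("?" * len(types))
print(qry)  # SELECT * FROM request WHERE out_of_scope=0 AND type IN (?,?,?)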
Example 5
    def get_request(self, id):
        req = None
        qry = "SELECT * FROM request WHERE out_of_scope=0 AND id=?"
        try:
            self.connect()
            cur = self.conn.cursor()
            cur.execute(qry, (str(id), ))
            r = cur.fetchone()
            # !! parent must be null (or unset)
            if r:
                req = Request(r['type'],
                              r['method'],
                              r['url'],
                              referer=r['referer'],
                              data=r['data'],
                              json_cookies=r['cookies'],
                              db_id=r['id'],
                              parent_db_id=r['id_parent'],
                              extra_headers=json.loads(r['extra_headers']))
            self.close()
        except Exception as e:
            print("get_request error: %s" % str(e))
            raise

        return req
Example 6
    def get_requests(self, types="xhr"):
        types = types.split(",")
        ret = []
        qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (" + ",".join(
            ["?" for _ in range(0, len(types))]) + ")"
        try:
            self.connect()
            cur = self.conn.cursor()
            cur.execute(qry, types)
            for r in cur.fetchall():
                # !! parent must be null (or unset)
                req = Request(r['type'],
                              r['method'],
                              r['url'],
                              referer=r['referer'],
                              data=r['data'],
                              json_cookies=r['cookies'],
                              db_id=r['id'],
                              parent_db_id=r['id_parent'])
                ret.append(req)
            self.close()
        except Exception as e:
            print str(e)

        return ret
Example 7
    def get_requests(self, types="xhr", where=None):
        types = types.split(",")
        ret = []
        qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s) and %s order by id desc" % (
            ",".join("?" * len(types)), "1" if where is None else where)
        try:
            self.connect()
            cur = self.conn.cursor()
            cur.execute(qry, types)
            for r in cur.fetchall():
                # !! parent must be null (or unset)
                req = Request(r['type'],
                              r['method'],
                              r['url'],
                              referer=r['referer'],
                              data=r['data'],
                              json_cookies=r['cookies'],
                              db_id=r['id'],
                              parent_db_id=r['id_parent'],
                              extra_headers=json.loads(r['extra_headers']))
                ret.append(req)
            self.close()
        except Exception as e:
            print("114 %s" % str(e))

        return ret
Example 8
    def __init__(self, data, parent):
        self.status = "ok"
        self.requests = []
        self.cookies = []
        self.redirect = []
        self.errmessage = ""
        # if True the probe returned no error BUT the json is not closed properly
        self.partialcontent = False
        self.html = None
        self.user_output = []
        self.page_hash = 0

        status = data["status"]

        if status == "error":
            self.status = "error"
            self.errmessage = data["errors"]

        # grab cookies before creating requests
        for cookie in data["cookies"]:
            self.cookies.append(Cookie(cookie, parent.url))

        for redirect in data['redirect']:
            r = Request(REQTYPE_REDIRECT,
                        "GET",
                        redirect,
                        parent=parent,
                        set_cookie=self.cookies,
                        parent_db_id=parent.db_id)
            self.redirect.append(r)

        requests = data["requests"]
        for request in requests:
            request = json.loads(request)
            r = Request(request['type'],
                        request['method'],
                        request['url'],
                        parent=parent,
                        parent_db_id=parent.db_id,
                        set_cookie=self.cookies,
                        data=request['data'],
                        trigger=request.get("trigger", None),
                        extra_headers=request.get("extra_headers", None))
            self.requests.append(r)
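
This variant reads a flat dictionary instead of a key/value list, with each entry of "requests" being a JSON-encoded string; a hypothetical payload matching the fields accessed above (all values are made up):

import json

probe_data = {
    "status": "ok",
    "errors": "",
    "cookies": [{"name": "sid", "value": "abc123"}],
    "redirect": ["http://example.com/home"],
    "requests": [
        json.dumps({"type": "link", "method": "GET",
                    "url": "http://example.com/page", "data": None, "trigger": None}),
    ],
}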
Example 9
 def test_set_params_for_probe(self):
     req = Request("type1",
                   "POST",
                   "http://example.com",
                   data="example data",
                   http_auth="auth1")
     Shared.options['set_referer'] = None
     thread = CrawlerThread()
     params = thread._set_probe_params(req)
     print(req)
     self.assertIn("http://example.com/", params)
     pass
Example 10
    def _get_requests_from_robots(start_request):
        """
        read robots.txt file (if any) and create a list of request based on it's content

        :return: list of request
        """
        purl = urlsplit(start_request.url)
        url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)

        getreq = Request(REQTYPE_LINK, "GET", url)
        try:
            # request, timeout, retries=None, user_agent=None, proxy=None):
            httpget = HttpGet(getreq, 10, 1, "Googlebot",
                              Shared.options['proxy'])
            lines = httpget.get_file().split("\n")
        except urllib2.HTTPError:
            return []
        except:
            raise

        requests = []
        for line in lines:
            directive = ""
            url = None
            try:
                directive, url = re.sub(r"#.*", "", line).split(":", 1)
            except Exception as e:
                print(str(e))
                continue  # ignore errors

            if re.match("(dis)?allow", directive.strip(), re.I):
                req = Request(REQTYPE_LINK,
                              "GET",
                              url.strip(),
                              parent=start_request)
                if request_is_crawlable(req):
                    requests.append(req)

        return adjust_requests(requests) if requests else []
Example 11
    def get_requests(self):
        requests = []

        try:
            headers = {
                "user-agent": self.useragent,
            }
            headers.update(self.extra_headers)

            res = reqlib.request(method=self.request.method,
                                 url=self.request.url,
                                 verify=False,
                                 timeout=self.timeout,
                                 cookies=toReqCok(self.request.cookies),
                                 proxies=self.proxy)
        except Exception as e:
            raise e

        log.debug("HttpGet get_requests ===> %s,%d,%d" %
                  (self.request.url, res.status_code, len(res.text)))

        if res.headers["content-type"] is not None and res.headers[
                'content-type'].lower().split(";")[0] != "text/html":
            raise NotHtmlException(ERROR_CONTENTTYPE)

        if res.content is None:
            raise NotHtmlException

        try:
            urls = get_urls(res.text)
            for url in urls:
                # @TODO handle FORMS
                requests.append(
                    Request(REQTYPE_LINK,
                            "GET",
                            url,
                            parent=self.request,
                            set_cookie=res.headers.get("set-cookie"),
                            parent_db_id=self.request.db_id))
        except Exception as e:
            raise e

        return requests
Example 12
 def rawsend(self,
             url,
             method=None,
             data=None,
             cookies=None,
             user_agent=None,
             proxy=None,
             extra_headers=None,
             req_timeout=5,
             ignore_errors=False):
     if not method:
         method = METHOD_GET
     req = Request(REQTYPE_LINK, method, url)
     http = HttpGet(req,
                    req_timeout,
                    proxy=proxy,
                    useragent=user_agent,
                    extra_headers=extra_headers)
     return http.send_request(method=method,
                              url=url,
                              data=data,
                              cookies=cookies,
                              ignore_errors=ignore_errors)
Example 13
    def get_not_crawled_request(self):
        """
        connect, retrieve existing never crawled requests then close the connection
        :return: list of request
        """
        requests = []
        query = "SELECT * FROM request WHERE crawled=0 AND out_of_scope=0"

        self.connect()
        cur = self.conn.cursor()
        cur.execute(query)
        for request in cur.fetchall():
            req = Request(request['type'],
                          request['method'],
                          request['url'],
                          referer=request['referer'],
                          data=request['data'],
                          json_cookies=request['cookies'],
                          db_id=request['id'],
                          parent_db_id=request['id_parent'])
            requests.append(req)
        self.close()

        return requests
Example 14
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print("Dependences errors: ")
            for err in deps_errors:
                print("  %s" % err)
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:L:Mg:')
        except getopt.GetoptError as err:
            print(str(err))
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print("error reading cookie file")
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print(e)
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com into a pattern matching domain.com and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    try:
                        re.match(eu, "")
                    except:
                        print("* ERROR: regex failed: %s" % eu)
                        sys.exit(1)
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print("* ERROR: wrong scope set '%s'" % v)
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print("* ERROR: wrong mode set '%s'" % v)
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-M":
                Shared.options['simulate_real_events'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv
            elif o == "-L":
                try:
                    with open(v) as cf:
                        Shared.options['login_sequence'] = json.loads(
                            cf.read())
                        Shared.options['login_sequence'][
                            "__file__"] = os.path.abspath(v)
                except ValueError as e:
                    print("* ERROR: decoding login sequence")
                    sys.exit(1)
                except Exception as e:
                    print("* ERROR: login sequence file not found")
                    sys.exit(1)
            elif o == "-g":
                if not Shared.options['local_storage']:
                    Shared.options['local_storage'] = {}
                (hn, hv) = v.split("=", 1)
                ktks = hn.split(":", 1)
                if len(ktks) != 2 or ktks[0] not in ("L", "S"):
                    print(
                        "Error: the -g option must be in the form '[L|S]:key=value', use 'L' to set locaStorage and 'S' to set sessionStorage"
                    )
                    sys.exit(1)
                Shared.options['local_storage'][ktks[1]] = {
                    "type": ktks[0],
                    "value": hv
                }

        probe_cmd = get_node_cmd()
        if not probe_cmd:  # maybe useless
            print("Error: unable to find node executable")
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print("* Warinig: option -d is valid only if scope is %s" %
                  CRAWLSCOPE_DOMAIN)

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print("error decoding cookie string")
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_options.extend([
                "-y",
                "%s:%s:%s" % (Shared.options['proxy']['proto'],
                              Shared.options['proxy']['host'],
                              Shared.options['proxy']['port'])
            ])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        if Shared.options['local_storage']:
            probe_options.extend(
                ["-g", json.dumps(Shared.options['local_storage'])])

        if not Shared.options['simulate_real_events']:
            probe_options.append("-M")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        if Shared.options['login_sequence'] and Shared.options[
                'login_sequence']['type'] == LOGSEQTYPE_SHARED:
            login_req = Request(REQTYPE_LINK,
                                "GET",
                                Shared.options['login_sequence']['url'],
                                set_cookie=Shared.start_cookies,
                                http_auth=http_auth,
                                referer=start_referer,
                                extra_headers=Shared.options['extra_headers'])
            stdoutw("Logging in . . . ")
            try:
                pe = ProbeExecutor(
                    login_req,
                    Shared.probe_cmd + ["-z"],
                    login_sequence=Shared.options['login_sequence'])
                probe = pe.execute()
                if not probe:
                    print("\n* ERROR: login sequence failed to execute probe")
                    sys.exit(1)
                if probe.status == "ok":
                    for c in probe.cookies:
                        if not Shared.options['login_sequence'][
                                'cookies'] or c.name in Shared.options[
                                    'login_sequence']['cookies']:
                            Shared.start_cookies.append(c)
                else:
                    print("\n* ERROR: login sequence failed:\n   %s" %
                          probe.errmessage)
                    sys.exit(1)
            except KeyboardInterrupt:
                pe.terminate()
                print("\nAborted")
                sys.exit(0)
            print("done")

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer,
                            extra_headers=Shared.options['extra_headers'])

        if not hasattr(ssl, "SSLContext"):
            print(
                "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
            )

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print(str(e))
            sys.exit(1)

        database.save_crawl_info(
            htcap_version=get_program_infos()['version'],
            target=Shared.starturl,
            start_date=self.crawl_start_time,
            commandline=cmd_to_str(argv),
            user_agent=Shared.options['useragent'],
            proxy=json.dumps(Shared.options['proxy']),
            extra_headers=json.dumps(Shared.options['extra_headers']),
            cookies=json.dumps([x.get_dict() for x in Shared.start_cookies]))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print("done")
        print(
            "Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)"
            % (self.db_file, num_threads))

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print("Crawl finished, %d pages analyzed in %d minutes" %
              (Shared.requests_index,
               (self.crawl_end_time - self.crawl_start_time) // 60))

        database.save_crawl_info(end_date=self.crawl_end_time)
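
The -g branch above expects values of the form '[L|S]:key=value'; a standalone sketch of that parsing (the function name is made up, the logic mirrors the option handling):

def parse_storage_option(value):
    """Turn 'L:token=abc' into the entry stored under Shared.options['local_storage']."""
    name, val = value.split("=", 1)
    parts = name.split(":", 1)
    if len(parts) != 2 or parts[0] not in ("L", "S"):
        raise ValueError("expected '[L|S]:key=value' ('L' = localStorage, 'S' = sessionStorage)")
    return {parts[1]: {"type": parts[0], "value": val}}

print(parse_storage_option("L:token=abc"))  # {'token': {'type': 'L', 'value': 'abc'}}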
Example 15
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        deps_errors = check_dependences(self.base_dir)
        if len(deps_errors) > 0:
            print "Dependences errors: "
            for err in deps_errors:
                print "  %s" % err
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OveLlE:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                self.display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                try:
                    Shared.options['proxy'] = parse_proxy_string(v)
                except Exception as e:
                    print e
                    sys.exit(1)
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com into a pattern matching domain.com and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                self.verbose = True
            elif o == "-e":
                Shared.options['deduplicate_pages'] = False
            elif o == "-L":
                Shared.options['use_legacy_browser'] = True
            elif o == "-l":
                Shared.options['headless_chrome'] = False
            elif o == "-E":
                if not Shared.options['extra_headers']:
                    Shared.options['extra_headers'] = {}
                (hn, hv) = v.split("=", 1)
                Shared.options['extra_headers'][hn] = hv

        probe_cmd = get_phantomjs_cmd(
        ) if Shared.options['use_legacy_browser'] else get_node_cmd()
        if not probe_cmd:  # maybe useless
            print "Error: unable to find node (or phantomjs) executable"
            sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['use_legacy_browser']:
            if Shared.options['proxy']:
                probe_cmd.append("--proxy-type=%s" %
                                 Shared.options['proxy']['proto'])
                probe_cmd.append("--proxy=%s:%s" %
                                 (Shared.options['proxy']['host'],
                                  Shared.options['proxy']['port']))
            probe_cmd.append(os.path.join(self.base_dir, 'probe',
                                          'analyze.js'))
        else:
            if Shared.options['proxy']:
                probe_options.extend([
                    "-y",
                    "%s:%s:%s" % (Shared.options['proxy']['proto'],
                                  Shared.options['proxy']['host'],
                                  Shared.options['proxy']['port'])
                ])
            if not Shared.options['headless_chrome']:
                probe_options.append("-l")
            probe_cmd.append(
                os.path.join(self.base_dir, 'probe', 'chrome-probe',
                             'analyze.js'))

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        if Shared.options['extra_headers']:
            probe_options.extend(
                ["-E", json.dumps(Shared.options['extra_headers'])])

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        self.db_file = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(self.db_file, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'],
                                 proxy=json.dumps(Shared.options['proxy']),
                                 extra_headers=json.dumps(
                                     Shared.options['extra_headers']),
                                 cookies=json.dumps(start_cookies))

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            self.db_file, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)
Example 16
    def test___eq__(self, remove_tokens_mock):
        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        self.assertTrue(a == b)

        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type2",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        self.assertFalse(a == b)

        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type1",
                    "method2",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        self.assertFalse(a == b)

        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type1",
                    "method1",
                    "url2",
                    data="data1",
                    http_auth="auth1")
        self.assertFalse(a == b)

        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type1",
                    "method1",
                    "url1",
                    data="data2",
                    http_auth="auth1")
        self.assertFalse(a == b)

        a = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth1")
        b = Request("type1",
                    "method1",
                    "url1",
                    data="data1",
                    http_auth="auth2")
        self.assertFalse(a == b)

        a = Request("type1", "method1", "url1")
        b = None
        self.assertFalse(a == b)
        self.assertEqual(remove_tokens_mock.call_count, 0)
Example 17
    def run(self):

        # get database
        try:
            database = self._get_database(self._outfile_name,
                                          self._output_mode)

            crawl_id = database.save_crawl_info(
                htcap_version=get_program_infos()['version'],
                target=Shared.start_url,
                start_date=self.crawl_start_date,
                commandline=cmd_to_str(self.arg),
                user_agent=Shared.options['user_agent'],
                start_cookies=Shared.start_cookies)

            # if the current crawl is not the first one
            if crawl_id > 1:

                # retrieving options from the last crawl
                random_seed, cookies = database.retrieve_crawl_info(crawl_id -
                                                                    1)

                # if the db had a seed and none were provided before
                if random_seed and not Shared.options.get("random_seed"):
                    Shared.options["random_seed"] = random_seed

                # if no cookie was provided and some exist from the last crawl
                if len(Shared.start_cookies
                       ) <= 0 and cookies != "[]" and cookies is not None:
                    for cookie_string in self._parse_cookie_string(cookies):
                        Shared.start_cookies.append(Cookie(cookie_string))

            # if no seed has been set yet
            if not Shared.options.get("random_seed"):
                Shared.options["random_seed"] = self._generate_random_string(
                    20)

        except Exception as e:
            print(str(e))
            sys.exit(1)

        # set probe arguments
        self._set_probe()

        Shared.probe_cmd = self._probe["cmd"] + self._probe["options"]

        start_requests = []

        # create the start request object from provided arguments
        start_request_from_args = Request(REQTYPE_LINK,
                                          "GET",
                                          Shared.start_url,
                                          set_cookie=Shared.start_cookies,
                                          http_auth=self._http_auth,
                                          referer=self._start_referer)

        def _is_not_in_past_requests(request):
            """
            return True if the given request is not already in Shared.requests or start_requests
            """
            for r in Shared.requests + start_requests:
                if r == request:
                    return False
            return True

        # check starting url
        if self._initial_checks:
            try:
                self._check_request(start_request_from_args)
                stdoutw(". ")
            except KeyboardInterrupt:
                print("\nAborted")
                sys.exit(0)

        if self._output_mode in (CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
            try:
                # make the start url given in arguments crawlable again
                database.connect()
                database.save_request(start_request_from_args)
                database.make_request_crawlable(start_request_from_args)
                database.commit()
                database.close()

                # feeding the "done" request list from the db
                Shared.requests.extend(database.get_crawled_request())
                Shared.requests_index = len(Shared.requests)

                # if resume, add requests from db
                if self._output_mode == CRAWLOUTPUT_RESUME:
                    start_requests.extend(database.get_not_crawled_request())

                # if the request from args is neither in past nor future requests
                if _is_not_in_past_requests(start_request_from_args):
                    start_requests.append(start_request_from_args)
            except Exception as e:
                print(str(e))
                sys.exit(1)
        else:
            start_requests.append(start_request_from_args)

        # retrieving robots.txt content
        if self._get_robots_txt:
            try:
                start_requests.extend(
                    filter(
                        _is_not_in_past_requests,
                        self._get_requests_from_robots(
                            start_request_from_args)))
            except KeyboardInterrupt:
                print("\nAborted")
                sys.exit(0)

        # save starting request to db
        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print(
            "\nDone: {} starting url(s) and {} url(s) already crawled".format(
                len(start_requests), len(Shared.requests)))

        # starting crawling threads
        print("Database %s initialized, crawl starting with %d threads" %
              (database, self._num_threads))

        for n in range(0, self._num_threads):
            thread = CrawlerThread()
            self._threads.append(thread)
            thread.start()

        # running crawl loop
        self._main_loop(self._threads, start_requests, database,
                        self._display_progress, self._verbose)

        self._kill_threads(self._threads)

        self.crawl_end_date = int(time.time())

        print("Crawl finished, %d pages analyzed in %d minutes" %
              (Shared.requests_index,
               (self.crawl_end_date - self.crawl_start_date) / 60))

        # update end date in db
        database.update_crawl_info(crawl_id, self.crawl_end_date,
                                   Shared.options["random_seed"],
                                   Shared.end_cookies)
Example 18
    def get_requests(self):  # Shared.options['process_timeout']

        if self.request.method == "POST":
            raise Exception("POST method with urllib is not supported yet")

        #parent = self.request.parent.url if self.request.parent else ""

        self.retries_interval = 0.5

        jar_response = cookielib.LWPCookieJar()
        jar_request = cookielib.LWPCookieJar()

        html = ""
        set_cookie = []

        requests = []

        while True:
            try:
                #Shared.th_lock.acquire()

                for cookie in self.request.cookies:
                    jar_request.set_cookie(cookie.get_cookielib_cookie())

                #Shared.th_lock.release()

                opener = self.urllib2_opener(self.request, jar_response)
                req = urllib2.Request(url=self.request.url)
                jar_request.add_cookie_header(req)

                res = opener.open(req, None, self.timeout)

                for cookie in jar_response:
                    set_cookie.append(Cookie(cookie.__dict__,
                                             self.request.url))

                ctype = res.info(
                )['Content-Type']  # @TODO !! WRONG!! (check if wrong...not sure)
                if ctype is not None:
                    if ctype.lower().split(";")[0] != "text/html":
                        opener.close()
                        raise NotHtmlException(ERROR_CONTENTTYPE)

                html = res.read()
                opener.close()

                if html:
                    html = decode_bytes(html)
                    finder = UrlFinder(html)
                    try:
                        urls = finder.get_urls()
                    except Exception as e:
                        raise

                for url in urls:
                    # @TODO handle FORMS
                    requests.append(
                        Request(REQTYPE_LINK,
                                "GET",
                                url,
                                parent=self.request,
                                set_cookie=set_cookie,
                                parent_db_id=self.request.db_id))

                break

            except RedirectException as e:
                set_cookie = []
                for cookie in jar_response:
                    set_cookie.append(Cookie(cookie.__dict__,
                                             self.request.url))

                r = Request(REQTYPE_REDIRECT,
                            "GET",
                            str(e),
                            parent=self.request,
                            set_cookie=set_cookie,
                            parent_db_id=self.request.db_id)
                requests.append(r)
                break
            except NotHtmlException:
                raise
            except Exception as e:
                self.retries -= 1
                if self.retries == 0: raise
                time.sleep(self.retries_interval)

        return requests
Example 19
    def main(self, argv):
        Shared.options = self.defaults
        Shared.th_condition = threading.Condition()
        Shared.main_condition = threading.Condition()

        probe_cmd = get_phantomjs_cmd()
        if not probe_cmd:
            print "Error: unable to find phantomjs executable"
            sys.exit(1)

        start_cookies = []
        start_referer = None

        probe_options = ["-R", self.randstr(20)]
        threads = []
        num_threads = self.defaults['num_threads']

        out_file = ""
        out_file_overwrite = self.defaults['out_file_overwrite']
        cookie_string = None
        display_progress = True
        verbose = False
        initial_checks = True
        http_auth = None
        get_robots_txt = True
        save_html = False
        user_script = None

        try:
            opts, args = getopt.getopt(
                argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
        except getopt.GetoptError as err:
            print str(err)
            sys.exit(1)

        if len(args) < 2:
            self.usage()
            sys.exit(1)

        for o, v in opts:
            if o == '-h':
                self.usage()
                sys.exit(0)
            elif o == '-c':
                cookie_string = v
            elif o == '-C':
                try:
                    with open(v) as cf:
                        cookie_string = cf.read()
                except Exception as e:
                    print "error reading cookie file"
                    sys.exit(1)
            elif o == '-r':
                start_referer = v
            elif o == '-n':
                num_threads = int(v)
            elif o == '-t':
                Shared.options['process_timeout'] = int(v)
            elif o == '-q':
                display_progress = False
            elif o == '-A':
                http_auth = v
            elif o == '-p':
                if v == "tor": v = "socks5:127.0.0.1:9150"
                proxy = v.split(":")
                if proxy[0] not in ("http", "socks5"):
                    print "only http and socks5 proxies are supported"
                    sys.exit(1)
                Shared.options['proxy'] = {
                    "proto": proxy[0],
                    "host": proxy[1],
                    "port": proxy[2]
                }
            elif o == '-d':
                for ad in v.split(","):
                    # convert *.domain.com into a pattern matching domain.com and any subdomain
                    pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                    Shared.allowed_domains.add(pattern)
            elif o == '-x':
                for eu in v.split(","):
                    Shared.excluded_urls.add(eu)
            elif o == "-G":
                Shared.options['group_qs'] = True
            elif o == "-w":
                out_file_overwrite = True
            elif o == "-R":
                Shared.options['max_redirects'] = int(v)
            elif o == "-U":
                Shared.options['useragent'] = v
            elif o == "-s":
                if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                             CRAWLSCOPE_URL):
                    self.usage()
                    print "* ERROR: wrong scope set '%s'" % v
                    sys.exit(1)
                Shared.options['scope'] = v
            elif o == "-m":
                if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                             CRAWLMODE_AGGRESSIVE):
                    self.usage()
                    print "* ERROR: wrong mode set '%s'" % v
                    sys.exit(1)
                Shared.options['mode'] = v
            elif o == "-S":
                initial_checks = False
            elif o == "-I":
                get_robots_txt = False
            elif o == "-H":
                save_html = True
            elif o == "-D":
                Shared.options['max_depth'] = int(v)
            elif o == "-P":
                Shared.options['max_post_depth'] = int(v)
            elif o == "-O":
                Shared.options['override_timeout_functions'] = False
            elif o == "-F":
                Shared.options['crawl_forms'] = False
            elif o == "-v":
                verbose = True
            elif o == "-u":
                if os.path.isfile(v):
                    user_script = os.path.abspath(v)
                else:
                    print "error: unable to open USER_SCRIPT"
                    sys.exit(1)

        if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
                Shared.allowed_domains) > 0:
            print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN

        if cookie_string:
            try:
                start_cookies = self.parse_cookie_string(cookie_string)
            except Exception as e:
                print "error decoding cookie string"
                sys.exit(1)

        if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
            probe_options.append("-f")  # dont fill values
        if Shared.options['mode'] == CRAWLMODE_PASSIVE:
            probe_options.append("-t")  # dont trigger events

        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" %
                             Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" %
                             (Shared.options['proxy']['host'],
                              Shared.options['proxy']['port']))

        probe_cmd.append(self.base_dir + 'probe/analyze.js')

        if len(Shared.excluded_urls) > 0:
            probe_options.extend(("-X", ",".join(Shared.excluded_urls)))

        if save_html:
            probe_options.append("-H")

        if user_script:
            probe_options.extend(("-u", user_script))

        probe_options.extend(("-x", str(Shared.options['process_timeout'])))
        probe_options.extend(("-A", Shared.options['useragent']))

        if not Shared.options['override_timeout_functions']:
            probe_options.append("-O")

        Shared.probe_cmd = probe_cmd + probe_options

        Shared.starturl = normalize_url(args[0])
        out_file = args[1]

        purl = urlsplit(Shared.starturl)
        Shared.allowed_domains.add(purl.hostname)

        for sc in start_cookies:
            Shared.start_cookies.append(Cookie(sc, Shared.starturl))

        start_req = Request(REQTYPE_LINK,
                            "GET",
                            Shared.starturl,
                            set_cookie=Shared.start_cookies,
                            http_auth=http_auth,
                            referer=start_referer)

        if not hasattr(ssl, "SSLContext"):
            print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"

        stdoutw("Initializing . ")

        if user_script and initial_checks:
            self.check_user_script_syntax(probe_cmd, user_script)

        start_requests = self.init_crawl(start_req, initial_checks,
                                         get_robots_txt)

        database = None
        fname = self.generate_filename(out_file, out_file_overwrite)
        try:
            database = self.init_db(fname, out_file)
        except Exception as e:
            print str(e)
            sys.exit(1)

        database.save_crawl_info(htcap_version=get_program_infos()['version'],
                                 target=Shared.starturl,
                                 start_date=self.crawl_start_time,
                                 commandline=cmd_to_str(argv),
                                 user_agent=Shared.options['useragent'])

        database.connect()
        database.begin()
        for req in start_requests:
            database.save_request(req)
        database.commit()
        database.close()

        print "done"
        print "Database %s initialized, crawl started with %d threads" % (
            fname, num_threads)

        for n in range(0, num_threads):
            thread = CrawlerThread()
            threads.append(thread)
            thread.start()

        self.main_loop(threads, start_requests, database, display_progress,
                       verbose)

        self.kill_threads(threads)

        self.crawl_end_time = int(time.time())

        print "Crawl finished, %d pages analyzed in %d minutes" % (
            Shared.requests_index,
            (self.crawl_end_time - self.crawl_start_time) / 60)

        database.save_crawl_info(end_date=self.crawl_end_time)