def __init__(self, data, parent):
    """Parse the raw probe output (a list of json records) into a result object.

    data   -- list of records emitted by the browser probe; the LAST element is the
              status record (consumed with pop()), the rest are (key, value) pairs
              ("cookies", "request", "html", "page_hash", "user") -- TODO confirm
              the exact element shape against the probe's serializer.
    parent -- the Request that triggered this probe run; used as parent/referer for
              every Request created here.
    """
    self.status = "ok"
    self.requests = []
    self.cookies = []
    self.redirect = None
    # if True the probe returned no error BUT the json is not closed properly
    self.partialcontent = False
    self.html = None
    self.user_output = []
    self.page_hash = 0
    # the status record is always the last element of the probe output
    status = data.pop()
    if status['status'] == "error":
        self.status = "error"
        self.errcode = status['code']
    if "partialcontent" in status:
        self.partialcontent = status['partialcontent']
    # grab cookies before creating requests, so every Request below
    # (including the redirect one) carries the full cookie set
    for key, val in data:
        if key == "cookies":
            for cookie in val:
                self.cookies.append(Cookie(cookie, parent.url))
    if "redirect" in status:
        self.redirect = status['redirect']
        r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent, set_cookie=self.cookies, parent_db_id=parent.db_id)
        self.requests.append(r)
    # second pass: build requests and collect the remaining record types
    for key, val in data:
        if key == "request":
            trigger = val['trigger'] if 'trigger' in val else None
            r = Request(val['type'], val['method'], val['url'], parent=parent, set_cookie=self.cookies, data=val['data'], trigger=trigger, parent_db_id=parent.db_id)
            self.requests.append(r)
        elif key == "html":
            self.html = val
        elif key == "page_hash":
            # normalize a falsy hash (None/0/"") to 0
            page_hash = TextHash(val).hash
            self.page_hash = page_hash if page_hash else 0
        elif key == "user":
            self.user_output.append(val)
def __init__(self, data, parent):
    """Parse probe output (newer dict-based format) into a result object.

    data   -- dict with keys "status", "errors", "cookies", "redirect" and
              "requests" (the latter a list of json-encoded request strings).
    parent -- the Request that triggered this probe run.
    """
    self.status = "ok"
    self.requests = []
    self.cookies = []
    self.redirect = []
    self.errmessage = ""
    # if True the probe returned no error BUT the json is not closed properly
    self.partialcontent = False
    self.html = None
    self.user_output = []
    self.page_hash = 0
    status = data["status"]
    if status == "error":
        self.status = "error"
        self.errmessage = data["errors"]
    # grab cookies before creating requests, so every Request below
    # carries the full cookie set
    for cookie in data["cookies"]:
        self.cookies.append(Cookie(cookie, parent.url))
    # each redirect becomes its own Request object
    for redirect in data['redirect']:
        r = Request(REQTYPE_REDIRECT, "GET", redirect, parent=parent, set_cookie=self.cookies, parent_db_id=parent.db_id)
        self.redirect.append(r)
    requests = data["requests"]
    for request in requests:
        # each entry is a json string, decoded individually
        request = json.loads(request)
        r = Request(request['type'], request['method'], request['url'], parent=parent, parent_db_id=parent.db_id, set_cookie=self.cookies, data=request['data'], trigger=request.get("trigger", None), extra_headers=request.get("extra_headers", None))
        self.requests.append(r)
def send_request(self, method=None, url=None, data=None, cookies=None, ignore_errors=False, follow_redirect=False):
    """Send a single HTTP request, retrying on failure.

    method/url/data default to the values of self.request. `cookies` is a list
    of cookie dicts that override/extend the cookies of self.request.
    If ignore_errors is True, HTTP error responses (4xx/5xx) are returned like
    normal responses instead of raising.
    Returns a dict with keys: code, url, headers, body, time.
    Retries self.retries times, sleeping self.retries_interval between attempts;
    re-raises the last exception when retries are exhausted.
    """
    # Shared.options['process_timeout']
    if not method:
        method = self.request.method
    if not url:
        url = self.request.url
    if method in ("POST", "PUT"):
        if not data:
            data = self.request.data if self.request.data else ""
    if not cookies:
        cookies = []
    jar_request = cookielib.LWPCookieJar()
    ret = {
        "code": None,
        "url": None,
        "headers": None,
        "body": None,
        "time": None
    }
    while True:
        try:
            # merge request cookies with the overrides passed in `cookies`:
            # matching names take the overridden value
            existing_cookies = []
            for cookie in self.request.cookies:
                clc = cookie.get_cookielib_cookie()
                for c in cookies:
                    if c['name'] == cookie.name:
                        clc.value = c['value']
                        existing_cookies.append(c)
                jar_request.set_cookie(clc)
            # cookies that did not match any existing one are added as new
            for cookie in [x for x in cookies if x not in existing_cookies]:
                c = Cookie(cookie)  # check what to do with cookie.setter
                jar_request.set_cookie(c.get_cookielib_cookie())
            opener = self.urllib2_opener(self.request, None, follow_redirect)
            req = urllib2.Request(url=url, data=data.encode() if data else None)
            # force the HTTP verb (urllib decides GET/POST from `data` otherwise)
            req.get_method = lambda: method
            jar_request.add_cookie_header(req)
            if data and not 'Content-type' in req.headers:
                req.add_header("Content-type", detect_content_type(data))
            now = time.time()
            try:
                res = opener.open(req, None, self.timeout)
            except urllib2.HTTPError as e:
                if not ignore_errors:
                    raise
                # HTTPError objects are file-like; treat as a response
                res = e
            opener.close()
            ret['code'] = res.getcode()
            ret['url'] = res.geturl()
            ret['headers'] = ["%s: %s" % x for x in res.info().items()]
            ret['body'] = res.read()
            ret['time'] = time.time() - now
            break
        except Exception as e:
            # any failure (network, timeout, ...) consumes one retry
            self.retries -= 1
            if self.retries == 0:
                raise
            time.sleep(self.retries_interval)
    return ret
def get_requests(self):
    """Fetch self.request.url with urllib and extract follow-up requests.

    Returns a list of Request objects: links found in the HTML body, or a
    single REQTYPE_REDIRECT request when the opener signals a redirect.
    Raises NotHtmlException when the response is not text/html; retries
    other failures self.retries times with self.retries_interval pauses.
    """
    # Shared.options['process_timeout']
    if self.request.method == "POST":
        raise Exception("POST method with urllib is not supported yet")
    self.retries_interval = 0.5
    jar_response = cookielib.LWPCookieJar()
    jar_request = cookielib.LWPCookieJar()
    html = ""
    set_cookie = []
    requests = []
    while True:
        try:
            for cookie in self.request.cookies:
                jar_request.set_cookie(cookie.get_cookielib_cookie())
            opener = self.urllib2_opener(self.request, jar_response)
            req = urllib2.Request(url=self.request.url)
            jar_request.add_cookie_header(req)
            res = opener.open(req, None, self.timeout)
            # collect cookies set by the response
            for cookie in jar_response:
                set_cookie.append(Cookie(cookie.__dict__, self.request.url))
            ctype = res.info()['Content-Type']  # @TODO !! WRONG!! (check if wrong...not sure)
            if ctype is not None:
                # only text/html pages are parsed for links
                if ctype.lower().split(";")[0] != "text/html":
                    opener.close()
                    raise NotHtmlException(ERROR_CONTENTTYPE)
            html = res.read()
            opener.close()
            if html:
                html = decode_bytes(html)
                finder = UrlFinder(html)
                try:
                    urls = finder.get_urls()
                except Exception as e:
                    raise
                for url in urls:
                    # @TODO handle FORMS
                    requests.append(Request(REQTYPE_LINK, "GET", url, parent=self.request, set_cookie=set_cookie, parent_db_id=self.request.db_id))
            break
        except RedirectException as e:
            # the opener raised on redirect: record it as a single request
            # carrying the cookies the response set
            set_cookie = []
            for cookie in jar_response:
                set_cookie.append(Cookie(cookie.__dict__, self.request.url))
            r = Request(REQTYPE_REDIRECT, "GET", str(e), parent=self.request, set_cookie=set_cookie, parent_db_id=self.request.db_id)
            requests.append(r)
            break
        except NotHtmlException:
            # not retryable: propagate immediately
            raise
        except Exception as e:
            self.retries -= 1
            if self.retries == 0:
                raise
            time.sleep(self.retries_interval)
    return requests
def cookies_from_json(self, cookies):
    """Deserialize a json array of cookie dicts into Cookie objects.

    Cookies loaded from the db are always valid (no domain restrictions),
    so each Cookie is created without a "setter" -- see Cookie.py.
    """
    result = []
    for entry in json.loads(cookies):
        result.append(Cookie(entry))
    return result
def main(self, argv):
    """Entry point of the crawler: parse argv, build the probe command line,
    optionally run a shared login sequence, initialize the output database
    and run the crawl with a pool of CrawlerThread workers.

    argv -- getopt-style argument list; the two positional args are the
            start URL and the output db file.
    Exits the process on any argument/setup error.

    Fixes vs previous revision: user-facing typos corrected
    ("Warinig" -> "Warning", "locaStorage" -> "localStorage").
    """
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()
    deps_errors = check_dependences(self.base_dir)
    if len(deps_errors) > 0:
        print("Dependences errors: ")
        for err in deps_errors:
            print(" %s" % err)
        sys.exit(1)
    start_cookies = []
    start_referer = None
    # -R seeds the probe with a random string (one per crawl)
    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']
    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False
    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OvelE:L:Mg:')
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)
    if len(args) < 2:
        self.usage()
        sys.exit(1)
    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':  # cookie string
            cookie_string = v
        elif o == '-C':  # cookie file
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print("error reading cookie file")
                sys.exit(1)
        elif o == '-r':  # start referer
            start_referer = v
        elif o == '-n':  # number of crawler threads
            num_threads = int(v)
        elif o == '-t':  # probe process timeout
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':  # quiet
            self.display_progress = False
        elif o == '-A':  # http auth
            http_auth = v
        elif o == '-p':  # proxy
            try:
                Shared.options['proxy'] = parse_proxy_string(v)
            except Exception as e:
                print(e)
                sys.exit(1)
        elif o == '-d':  # allowed domains
            for ad in v.split(","):
                # convert *.domain.com to *.\.domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':  # excluded urls (regexes, validated up front)
            for eu in v.split(","):
                try:
                    re.match(eu, "")
                except:
                    print("* ERROR: regex failed: %s" % eu)
                    sys.exit(1)
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":  # crawl scope
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print("* ERROR: wrong scope set '%s'" % v)
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":  # crawl mode
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print("* ERROR: wrong mode set '%s'" % v)
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":  # skip initial checks
            initial_checks = False
        elif o == "-I":  # ignore robots.txt
            get_robots_txt = False
        elif o == "-H":  # save html
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            self.verbose = True
        elif o == "-e":
            Shared.options['deduplicate_pages'] = False
        elif o == "-l":
            Shared.options['headless_chrome'] = False
        elif o == "-M":
            Shared.options['simulate_real_events'] = False
        elif o == "-E":  # extra header, "Name=value"
            if not Shared.options['extra_headers']:
                Shared.options['extra_headers'] = {}
            (hn, hv) = v.split("=", 1)
            Shared.options['extra_headers'][hn] = hv
        elif o == "-L":  # login sequence json file
            try:
                with open(v) as cf:
                    Shared.options['login_sequence'] = json.loads(cf.read())
                    Shared.options['login_sequence']["__file__"] = os.path.abspath(v)
            except ValueError as e:
                print("* ERROR: decoding login sequence")
                sys.exit(1)
            except Exception as e:
                print("* ERROR: login sequence file not found")
                sys.exit(1)
        elif o == "-g":  # local/session storage entry, "[L|S]:key=value"
            if not Shared.options['local_storage']:
                Shared.options['local_storage'] = {}
            (hn, hv) = v.split("=", 1)
            ktks = hn.split(":", 1)
            if len(ktks) != 2 or ktks[0] not in ("L", "S"):
                print("Error: the -g option must be in the form '[L|S]:key=value', use 'L' to set localStorage and 'S' to set sessionStorage")
                sys.exit(1)
            Shared.options['local_storage'][ktks[1]] = {"type": ktks[0], "value": hv}
    probe_cmd = get_node_cmd()
    if not probe_cmd:  # maybe useless
        print("Error: unable to find node executable")
        sys.exit(1)
    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print("* Warning: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN)
    if cookie_string:
        try:
            start_cookies = parse_cookie_string(cookie_string)
        except Exception as e:
            print("error decoding cookie string")
            sys.exit(1)
    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events
    if Shared.options['proxy']:
        probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'], Shared.options['proxy']['host'], Shared.options['proxy']['port'])])
    if not Shared.options['headless_chrome']:
        probe_options.append("-l")
    probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))
    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))
    if save_html:
        probe_options.append("-H")
    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))
    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")
    if Shared.options['extra_headers']:
        probe_options.extend(["-E", json.dumps(Shared.options['extra_headers'])])
    if Shared.options['local_storage']:
        probe_options.extend(["-g", json.dumps(Shared.options['local_storage'])])
    if not Shared.options['simulate_real_events']:
        probe_options.append("-M")
    Shared.probe_cmd = probe_cmd + probe_options
    Shared.starturl = normalize_url(args[0])
    out_file = args[1]
    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)
    # a "shared" login sequence runs once up front and its cookies are
    # shared by all crawler threads
    if Shared.options['login_sequence'] and Shared.options['login_sequence']['type'] == LOGSEQTYPE_SHARED:
        login_req = Request(REQTYPE_LINK, "GET", Shared.options['login_sequence']['url'], set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer, extra_headers=Shared.options['extra_headers'])
        stdoutw("Logging in . . . ")
        try:
            pe = ProbeExecutor(login_req, Shared.probe_cmd + ["-z"], login_sequence=Shared.options['login_sequence'])
            probe = pe.execute()
            if not probe:
                print("\n* ERROR: login sequence failed to execute probe")
                sys.exit(1)
            if probe.status == "ok":
                # keep only the cookies listed in the sequence (or all, if none listed)
                for c in probe.cookies:
                    if not Shared.options['login_sequence']['cookies'] or c.name in Shared.options['login_sequence']['cookies']:
                        Shared.start_cookies.append(c)
            else:
                print("\n* ERROR: login sequence failed:\n %s" % probe.errmessage)
                sys.exit(1)
        except KeyboardInterrupt:
            pe.terminate()
            print("\nAborted")
            sys.exit(0)
        print("done")
    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))
    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer, extra_headers=Shared.options['extra_headers'])
    if not hasattr(ssl, "SSLContext"):
        print("* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors")
    stdoutw("Initializing . ")
    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)
    database = None
    self.db_file = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(self.db_file, out_file)
    except Exception as e:
        print(str(e))
        sys.exit(1)
    database.save_crawl_info(
        htcap_version=get_program_infos()['version'],
        target=Shared.starturl,
        start_date=self.crawl_start_time,
        commandline=cmd_to_str(argv),
        user_agent=Shared.options['useragent'],
        proxy=json.dumps(Shared.options['proxy']),
        extra_headers=json.dumps(Shared.options['extra_headers']),
        cookies=json.dumps([x.get_dict() for x in Shared.start_cookies]))
    # seed the db with the start requests
    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()
    print("done")
    print("Database %s initialized, crawl started with %d threads (^C to pause or change verbosity)" % (self.db_file, num_threads))
    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()
    self.main_loop(threads, start_requests, database)
    self.kill_threads(threads)
    self.crawl_end_time = int(time.time())
    print("Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) // 60))
    database.save_crawl_info(end_date=self.crawl_end_time)
def main(self, args, opts):
    """Login helper entry point (Python 2): drive login.js through phantomjs
    and print the resulting cookies / logout urls.

    args -- positional: url, username, [optional third arg passed to login.js]
    opts -- getopt pairs: -p password, -c suppress cookies, -l suppress
            logout urls, -H/-J/-A select output format (htcap args / json /
            argument list); default is human-readable.
    """
    passw = None
    format = None
    out_cookies = True
    out_logouts = True
    for o, v in opts:
        if o == "-h":
            print self.usage()
            sys.exit(0)
        elif o == "-p":
            passw = v
        elif o == "-c":
            out_cookies = False
        elif o == "-l":
            out_logouts = False
        elif o in ("-H", "-J", "-A"):
            format = o
    if not passw:
        # prompt interactively when -p was not given
        print "The password is hidden here BUT it will be passed to phantomjs via commandline ..."
        try:
            passw = getpass.getpass()
        except KeyboardInterrupt:
            print "\nAbort..."
            sys.exit(0)
    jspath = "%s%s%s%s" % (getrealdir(__file__), "login", os.sep, "login.js")
    cmd = get_phantomjs_cmd() + [jspath, args[0], args[1], passw]
    if len(args) > 2:
        cmd.append(args[2])
    # run phantomjs with a 20 second timeout; stdout is the json result
    exe = CommandExecutor(cmd, True)
    out, err = exe.execute(20)
    if err:
        print "Unable to login"
        sys.exit(1)
    try:
        ret = json.loads(out)
    except ValueError as e:
        print e
        sys.exit(1)
    allcookies, logouts = ret
    cookies = []
    if out_cookies:
        # iterate in reverse so the most recent value of a duplicated
        # cookie wins (first occurrence kept)
        for c in reversed(allcookies):
            cookie = Cookie(c)
            if not cookie in cookies:
                cookies.append(cookie)
    if not out_logouts:
        logouts = []
    if not format:
        # human-readable output
        print "Cookies:"
        for c in cookies:
            print " %s=%s" % (c.name, c.value)
        print "Logout urls:"
        for u in logouts:
            print " %s" % u
    elif format == "-A":
        # one shell-quoted token pair / url per line
        for c in cookies:
            print cmd_to_str([c.name, c.value])
        for u in logouts:
            print cmd_to_str([u])
    elif format == "-H":
        # htcap command line arguments (-c cookies, -x excluded urls)
        args = []
        if len(cookies) > 0:
            args = ["-c", ";".join(["%s=%s" % (c.name, c.value) for c in cookies])]
        if len(logouts) > 0:
            args.extend(["-x", ",".join(logouts)])
        if len(args) > 0:
            print cmd_to_str(args)
    elif format == "-J":
        # json output (cookies only)
        cd = []
        for c in cookies:
            cd.append(c.get_dict())
        if out_cookies:
            print json.dumps(cd)
def main(self, argv):
    """Crawler entry point (Python 2, legacy/chrome hybrid): parse argv,
    build the probe command line (phantomjs with -L, node/chrome otherwise),
    initialize the output database and run the crawl with CrawlerThread
    workers.

    argv -- getopt-style argument list; positional args are the start URL
            and the output db file. Exits the process on setup errors.
    """
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()
    deps_errors = check_dependences(self.base_dir)
    if len(deps_errors) > 0:
        print "Dependences errors: "
        for err in deps_errors:
            print " %s" % err
        sys.exit(1)
    start_cookies = []
    start_referer = None
    # -R seeds the probe with a random string (one per crawl)
    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']
    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False
    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:OveLlE:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)
    if len(args) < 2:
        self.usage()
        sys.exit(1)
    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':  # cookie string
            cookie_string = v
        elif o == '-C':  # cookie file
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':  # start referer
            start_referer = v
        elif o == '-n':  # number of crawler threads
            num_threads = int(v)
        elif o == '-t':  # probe process timeout
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':  # quiet
            self.display_progress = False
        elif o == '-A':  # http auth
            http_auth = v
        elif o == '-p':  # proxy
            try:
                Shared.options['proxy'] = parse_proxy_string(v)
            except Exception as e:
                print e
                sys.exit(1)
        elif o == '-d':  # allowed domains
            for ad in v.split(","):
                # convert *.domain.com to *.\.domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':  # excluded urls
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":  # crawl scope
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":  # crawl mode
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":  # skip initial checks
            initial_checks = False
        elif o == "-I":  # ignore robots.txt
            get_robots_txt = False
        elif o == "-H":  # save html
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            self.verbose = True
        elif o == "-e":
            Shared.options['deduplicate_pages'] = False
        elif o == "-L":  # use phantomjs instead of headless chrome
            Shared.options['use_legacy_browser'] = True
        elif o == "-l":
            Shared.options['headless_chrome'] = False
        elif o == "-E":  # extra header, "Name=value"
            if not Shared.options['extra_headers']:
                Shared.options['extra_headers'] = {}
            (hn, hv) = v.split("=", 1)
            Shared.options['extra_headers'][hn] = hv
    probe_cmd = get_phantomjs_cmd() if Shared.options['use_legacy_browser'] else get_node_cmd()
    if not probe_cmd:  # maybe useless
        print "Error: unable to find node (or phantomjs) executable"
        sys.exit(1)
    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN
    if cookie_string:
        try:
            start_cookies = parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)
    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events
    # the proxy is configured differently for phantomjs vs the chrome probe
    if Shared.options['use_legacy_browser']:
        if Shared.options['proxy']:
            probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
            probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'analyze.js'))
    else:
        if Shared.options['proxy']:
            probe_options.extend(["-y", "%s:%s:%s" % (Shared.options['proxy']['proto'], Shared.options['proxy']['host'], Shared.options['proxy']['port'])])
        if not Shared.options['headless_chrome']:
            probe_options.append("-l")
        probe_cmd.append(os.path.join(self.base_dir, 'probe', 'chrome-probe', 'analyze.js'))
    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))
    if save_html:
        probe_options.append("-H")
    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))
    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")
    if Shared.options['extra_headers']:
        probe_options.extend(["-E", json.dumps(Shared.options['extra_headers'])])
    Shared.probe_cmd = probe_cmd + probe_options
    Shared.starturl = normalize_url(args[0])
    out_file = args[1]
    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)
    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))
    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
    stdoutw("Initializing . ")
    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)
    database = None
    self.db_file = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(self.db_file, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)
    database.save_crawl_info(htcap_version=get_program_infos()['version'],
                             target=Shared.starturl,
                             start_date=self.crawl_start_time,
                             commandline=cmd_to_str(argv),
                             user_agent=Shared.options['useragent'],
                             proxy=json.dumps(Shared.options['proxy']),
                             extra_headers=json.dumps(Shared.options['extra_headers']),
                             cookies=json.dumps(start_cookies))
    # seed the db with the start requests
    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()
    print "done"
    print "Database %s initialized, crawl started with %d threads" % (self.db_file, num_threads)
    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()
    self.main_loop(threads, start_requests, database)
    self.kill_threads(threads)
    self.crawl_end_time = int(time.time())
    print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)
    database.save_crawl_info(end_date=self.crawl_end_time)
def main(self, argv):
    """Crawler entry point (Python 2, phantomjs-only era): parse argv, build
    the phantomjs probe command line, initialize the output database and
    run the crawl with CrawlerThread workers.

    argv -- getopt-style argument list; positional args are the start URL
            and the output db file. Exits the process on setup errors.
    """
    Shared.options = self.defaults
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()
    probe_cmd = get_phantomjs_cmd()
    if not probe_cmd:
        print "Error: unable to find phantomjs executable"
        sys.exit(1)
    start_cookies = []
    start_referer = None
    # -R seeds the probe with a random string (one per crawl)
    probe_options = ["-R", self.randstr(20)]
    threads = []
    num_threads = self.defaults['num_threads']
    out_file = ""
    out_file_overwrite = self.defaults['out_file_overwrite']
    cookie_string = None
    display_progress = True
    verbose = False
    initial_checks = True
    http_auth = None
    get_robots_txt = True
    save_html = False
    user_script = None
    try:
        opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
    except getopt.GetoptError as err:
        print str(err)
        sys.exit(1)
    if len(args) < 2:
        self.usage()
        sys.exit(1)
    for o, v in opts:
        if o == '-h':
            self.usage()
            sys.exit(0)
        elif o == '-c':  # cookie string
            cookie_string = v
        elif o == '-C':  # cookie file
            try:
                with open(v) as cf:
                    cookie_string = cf.read()
            except Exception as e:
                print "error reading cookie file"
                sys.exit(1)
        elif o == '-r':  # start referer
            start_referer = v
        elif o == '-n':  # number of crawler threads
            num_threads = int(v)
        elif o == '-t':  # probe process timeout
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':  # quiet
            display_progress = False
        elif o == '-A':  # http auth
            http_auth = v
        elif o == '-p':  # proxy; "tor" is a shortcut for the local tor socks proxy
            if v == "tor":
                v = "socks5:127.0.0.1:9150"
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print "only http and socks5 proxies are supported"
                sys.exit(1)
            Shared.options['proxy'] = {
                "proto": proxy[0],
                "host": proxy[1],
                "port": proxy[2]
            }
        elif o == '-d':  # allowed domains
            for ad in v.split(","):
                # convert *.domain.com to *.\.domain\.com
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':  # excluded urls
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":
            Shared.options['group_qs'] = True
        elif o == "-w":
            out_file_overwrite = True
        elif o == "-R":
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":
            Shared.options['useragent'] = v
        elif o == "-s":  # crawl scope
            if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
                self.usage()
                print "* ERROR: wrong scope set '%s'" % v
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":  # crawl mode
            if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
                self.usage()
                print "* ERROR: wrong mode set '%s'" % v
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":  # skip initial checks
            initial_checks = False
        elif o == "-I":  # ignore robots.txt
            get_robots_txt = False
        elif o == "-H":  # save html
            save_html = True
        elif o == "-D":
            Shared.options['max_depth'] = int(v)
        elif o == "-P":
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":
            Shared.options['crawl_forms'] = False
        elif o == "-v":
            verbose = True
        elif o == "-u":  # user script to inject into the probe
            if os.path.isfile(v):
                user_script = os.path.abspath(v)
            else:
                print "error: unable to open USER_SCRIPT"
                sys.exit(1)
    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
        print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN
    if cookie_string:
        try:
            start_cookies = self.parse_cookie_string(cookie_string)
        except Exception as e:
            print "error decoding cookie string"
            sys.exit(1)
    if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
        probe_options.append("-f")  # dont fill values
    if Shared.options['mode'] == CRAWLMODE_PASSIVE:
        probe_options.append("-t")  # dont trigger events
    if Shared.options['proxy']:
        probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
        probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
    probe_cmd.append(self.base_dir + 'probe/analyze.js')
    if len(Shared.excluded_urls) > 0:
        probe_options.extend(("-X", ",".join(Shared.excluded_urls)))
    if save_html:
        probe_options.append("-H")
    if user_script:
        probe_options.extend(("-u", user_script))
    probe_options.extend(("-x", str(Shared.options['process_timeout'])))
    probe_options.extend(("-A", Shared.options['useragent']))
    if not Shared.options['override_timeout_functions']:
        probe_options.append("-O")
    Shared.probe_cmd = probe_cmd + probe_options
    Shared.starturl = normalize_url(args[0])
    out_file = args[1]
    purl = urlsplit(Shared.starturl)
    Shared.allowed_domains.add(purl.hostname)
    for sc in start_cookies:
        Shared.start_cookies.append(Cookie(sc, Shared.starturl))
    start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
    if not hasattr(ssl, "SSLContext"):
        print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
    stdoutw("Initializing . ")
    # validate the user script before starting the crawl
    if user_script and initial_checks:
        self.check_user_script_syntax(probe_cmd, user_script)
    start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)
    database = None
    fname = self.generate_filename(out_file, out_file_overwrite)
    try:
        database = self.init_db(fname, out_file)
    except Exception as e:
        print str(e)
        sys.exit(1)
    database.save_crawl_info(htcap_version=get_program_infos()['version'],
                             target=Shared.starturl,
                             start_date=self.crawl_start_time,
                             commandline=cmd_to_str(argv),
                             user_agent=Shared.options['useragent'])
    # seed the db with the start requests
    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()
    print "done"
    print "Database %s initialized, crawl started with %d threads" % (fname, num_threads)
    for n in range(0, num_threads):
        thread = CrawlerThread()
        threads.append(thread)
        thread.start()
    self.main_loop(threads, start_requests, database, display_progress, verbose)
    self.kill_threads(threads)
    self.crawl_end_time = int(time.time())
    print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)
    database.save_crawl_info(end_date=self.crawl_end_time)
def send_request(self, method=None, url=None, data=None, cookies=None, ignore_errors=False):
    """Send a single HTTP request, retrying on failure (Python 2 variant).

    method/url/data default to the values of self.request. `cookies` is a
    list of cookie dicts that override/extend the cookies of self.request.
    If ignore_errors is True, HTTP error responses (4xx/5xx) are returned
    like normal responses instead of raising.
    Returns a dict with keys: code, url, headers, body, time.
    """
    # Shared.options['process_timeout']
    if not method:
        method = self.request.method
    if not url:
        url = self.request.url
    if method == "POST":
        if not data:
            data = self.request.data if self.request.data else ""
    if not cookies:
        cookies = []
    jar_request = cookielib.LWPCookieJar()
    ret = {
        "code": None,
        "url": None,
        "headers": None,
        "body": None,
        "time": None
    }
    while True:
        try:
            # merge request cookies with the overrides passed in `cookies`:
            # matching names take the overridden value
            existing_cookies = []
            for cookie in self.request.cookies:
                clc = cookie.get_cookielib_cookie()
                for c in cookies:
                    if c['name'] == cookie.name:
                        clc.value = c['value']
                        existing_cookies.append(c)
                jar_request.set_cookie(clc)
            # cookies that did not match any existing one are added as new
            for cookie in [x for x in cookies if x not in existing_cookies]:
                c = Cookie(cookie)  # check what to do with cookie.setter
                jar_request.set_cookie(c.get_cookielib_cookie())
            opener = self.urllib2_opener(self.request, None, True)
            req = urllib2.Request(url=url, data=data)
            jar_request.add_cookie_header(req)
            if self.extra_headers:
                for hn in self.extra_headers:
                    req.add_header(hn, self.extra_headers[hn])
            now = time.time()
            try:
                res = opener.open(req, None, self.timeout)
            except urllib2.HTTPError as e:
                if not ignore_errors:
                    raise
                # HTTPError objects are file-like; treat as a response
                res = e
            opener.close()
            ret['code'] = res.getcode()
            ret['url'] = res.geturl()
            ret['headers'] = [x.strip() for x in res.info().headers]
            ret['body'] = res.read()
            ret['time'] = time.time() - now
            break
        except Exception as e:
            # any failure (network, timeout, ...) consumes one retry
            self.retries -= 1
            if self.retries == 0:
                raise
            time.sleep(self.retries_interval)
    return ret
def run(self):
    """Run the crawl: open/resume the output database, restore state from a
    previous crawl when present, build the start request list (arguments,
    db backlog, robots.txt), then launch CrawlerThread workers and the
    main loop. Exits the process on unrecoverable errors.
    """
    # get database
    try:
        database = self._get_database(self._outfile_name, self._output_mode)
        crawl_id = database.save_crawl_info(
            htcap_version=get_program_infos()['version'],
            target=Shared.start_url,
            start_date=self.crawl_start_date,
            commandline=cmd_to_str(self.arg),
            user_agent=Shared.options['user_agent'],
            start_cookies=Shared.start_cookies)
        # if the current crawl is not the first one
        if crawl_id > 1:
            # retrieving options from the last crawl
            random_seed, cookies = database.retrieve_crawl_info(crawl_id - 1)
            # if the db had a seed and none were provided before
            if random_seed and not Shared.options.get("random_seed"):
                Shared.options["random_seed"] = random_seed
            # if no cookie was provided and some exist from the last crawl
            if len(Shared.start_cookies) <= 0 and cookies != "[]" and cookies is not None:
                for cookie_string in self._parse_cookie_string(cookies):
                    Shared.start_cookies.append(Cookie(cookie_string))
        # if no seed have been set yet
        if not Shared.options.get("random_seed"):
            Shared.options["random_seed"] = self._generate_random_string(20)
    except Exception as e:
        print(str(e))
        sys.exit(1)
    # set probe arguments
    self._set_probe()
    Shared.probe_cmd = self._probe["cmd"] + self._probe["options"]
    start_requests = []
    # create the start request object from provided arguments
    start_request_from_args = Request(REQTYPE_LINK, "GET", Shared.start_url, set_cookie=Shared.start_cookies, http_auth=self._http_auth, referer=self._start_referer)

    def _is_not_in_past_requests(request):
        """check if the given request is present in Shared.requests or start_requests"""
        is_in_request = True
        for r in Shared.requests + start_requests:
            if r == request:
                is_in_request = False
        return is_in_request

    # check starting url
    if self._initial_checks:
        try:
            self._check_request(start_request_from_args)
            stdoutw(". ")
        except KeyboardInterrupt:
            print("\nAborted")
            sys.exit(0)
    if self._output_mode in (CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
        try:
            # make the start url given in arguments crawlable again
            database.connect()
            database.save_request(start_request_from_args)
            database.make_request_crawlable(start_request_from_args)
            database.commit()
            database.close()
            # feeding the "done" request list from the db
            # NOTE(review): called after close() -- presumably the database
            # object reconnects internally; confirm against its implementation
            Shared.requests.extend(database.get_crawled_request())
            Shared.requests_index = len(Shared.requests)
            # if resume, add requests from db
            if self._output_mode == CRAWLOUTPUT_RESUME:
                start_requests.extend(database.get_not_crawled_request())
            # if request from args is neither in past or future requests
            if _is_not_in_past_requests(start_request_from_args):
                start_requests.append(start_request_from_args)
        except Exception as e:
            print(str(e))
            sys.exit(1)
    else:
        start_requests.append(start_request_from_args)
    # retrieving robots.txt content
    if self._get_robots_txt:
        try:
            start_requests.extend(
                filter(_is_not_in_past_requests,
                       self._get_requests_from_robots(start_request_from_args)))
        except KeyboardInterrupt:
            print("\nAborted")
            sys.exit(0)
    # save starting request to db
    database.connect()
    database.begin()
    for req in start_requests:
        database.save_request(req)
    database.commit()
    database.close()
    print("\nDone: {} starting url(s) and {} url(s) already crawled".format(len(start_requests), len(Shared.requests)))
    # starting crawling threads
    print("Database %s initialized, crawl starting with %d threads" % (database, self._num_threads))
    for n in range(0, self._num_threads):
        thread = CrawlerThread()
        self._threads.append(thread)
        thread.start()
    # running crawl loop
    self._main_loop(self._threads, start_requests, database, self._display_progress, self._verbose)
    self._kill_threads(self._threads)
    self.crawl_end_date = int(time.time())
    print("Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_date - self.crawl_start_date) / 60))
    # update end date in db
    database.update_crawl_info(crawl_id, self.crawl_end_date, Shared.options["random_seed"], Shared.end_cookies)
def _setup_shared(self):
    """
    Parse command-line arguments and populate the Shared crawl state
    (options, conditions, start URL, cookies, allowed domains).

    Reads ``self.arg`` (getopt-style argv tail); exits the process with a
    usage message on any invalid option, missing url/outfile, bad proxy,
    scope, mode or output-mode value.
    """
    Shared.options = self._defaults  # initialize shared options

    # initialize threads conditions
    Shared.th_condition = threading.Condition()
    Shared.main_condition = threading.Condition()

    # validate probe presence
    if not self._probe["cmd"]:
        print("Error: unable to find probe")
        sys.exit(1)

    # retrieving user arguments
    # NOTE(review): 'N' and 'K' appear in the optstring but have no handler
    # below — presumably reserved/deprecated; confirm before removing.
    try:
        opts, args = getopt.getopt(
            self.arg, 'ho:qvm:s:D:P:Fd:c:C:r:x:p:n:A:U:t:SGNR:IOKe:')
    except getopt.GetoptError as err:
        print(str(err))
        self._usage()
        sys.exit(1)

    if len(args) < 2:  # if no start url and file name
        self._usage()
        print('* Error: missing url and/or outfile')
        sys.exit(1)

    for o, v in opts:
        if o == '-h':  # help
            self._usage()
            sys.exit(0)
        elif o == '-c':  # cookie string
            self._cookie_string = v
        elif o == '-C':  # cookie file
            try:
                with open(v) as cf:
                    self._cookie_string = cf.read()
            except Exception as e:
                print("* Error reading cookie file: {}".format(str(e)))
                sys.exit(1)
        elif o == '-r':  # start referrer
            self._start_referer = v
        elif o == '-n':  # number of threads
            self._num_threads = int(v)
        elif o == '-t':  # time out
            Shared.options['process_timeout'] = int(v)
        elif o == '-q':  # quiet
            self._display_progress = False
        elif o == '-A':  # authentication
            self._http_auth = v
        elif o == '-p':  # proxy, expected as proto:host:port
            proxy = v.split(":")
            if proxy[0] not in ("http", "socks5"):
                print(
                    "* Error: only http and socks5 proxies are supported")
                sys.exit(1)
            # fail with a clear message instead of an IndexError traceback
            if len(proxy) != 3:
                print("* Error: proxy must be in the form proto:host:port")
                sys.exit(1)
            Shared.options['proxy'] = {
                "proto": proxy[0],
                "host": proxy[1],
                "port": proxy[2]
            }
        elif o == '-d':  # allowed domains
            for ad in v.split(","):
                # convert *.domain.com to ((.*\.)|)domain\.com so the
                # wildcard matches both the bare domain and any subdomain
                pattern = re.escape(ad).replace("\\*\\.", "((.*\\.)|)")
                Shared.allowed_domains.add(pattern)
        elif o == '-x':  # excluded urls
            for eu in v.split(","):
                Shared.excluded_urls.add(eu)
        elif o == "-G":  # group query strings
            Shared.options['group_qs'] = True
        elif o == "-o":  # output file mode
            if v not in (CRAWLOUTPUT_OVERWRITE, CRAWLOUTPUT_RENAME,
                         CRAWLOUTPUT_RESUME, CRAWLOUTPUT_COMPLETE):
                self._usage()
                print("* Error: wrong output mode set '%s'\n" % v)
                sys.exit(1)
            self._output_mode = v
        elif o == "-R":  # redirects limit
            Shared.options['max_redirects'] = int(v)
        elif o == "-U":  # user agent
            Shared.options['user_agent'] = v
        elif o == "-s":  # crawl scope
            if v not in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY,
                         CRAWLSCOPE_URL):
                self._usage()
                print("* ERROR: wrong scope set '%s'" % v)
                sys.exit(1)
            Shared.options['scope'] = v
        elif o == "-m":  # crawl mode
            if v not in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE,
                         CRAWLMODE_AGGRESSIVE):
                self._usage()
                print("* ERROR: wrong mode set '%s'" % v)
                sys.exit(1)
            Shared.options['mode'] = v
        elif o == "-S":  # skip initial checks
            self._initial_checks = False
        elif o == "-I":  # ignore robots.txt
            self._get_robots_txt = False
        elif o == "-D":  # crawling depth
            Shared.options['max_depth'] = int(v)
        elif o == "-P":  # crawling depth for forms
            Shared.options['max_post_depth'] = int(v)
        elif o == "-O":  # do not override javascript timeout
            Shared.options['override_timeout_functions'] = False
        elif o == "-F":  # do not crawl forms
            Shared.options['crawl_forms'] = False
        elif o == "-v":  # verbose
            self._verbose = True
        elif o == "-e":  # seed for random value
            Shared.options["random_seed"] = v

    # warn about -d option in domain scope mode
    if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(
            Shared.allowed_domains) > 0:
        print("* Warning: option -d is valid only if scope is %s"
              % CRAWLSCOPE_DOMAIN)

    # retrieve start url and output file arguments.
    # FIX: this must happen BEFORE cookie initialization — Cookie() binds
    # each start cookie to Shared.start_url, which was previously still
    # unset at that point.
    Shared.start_url = normalize_url(args[0])
    self._outfile_name = args[1]

    # initialize cookies
    if self._cookie_string:
        try:
            start_cookies = self._parse_cookie_string(self._cookie_string)
            for cookie in start_cookies:
                Shared.start_cookies.append(
                    Cookie(cookie, Shared.start_url))
        except Exception as e:
            print("error decoding cookie string: {}".format(str(e)))
            sys.exit(1)

    # add start url domain to allowed domains
    purl = urlsplit(Shared.start_url)
    Shared.allowed_domains.add(purl.hostname)

    # warn about ssl context in python 2
    if not hasattr(ssl, "SSLContext"):
        print(
            "* WARNING: SSLContext is not supported with this version of python,"
            " consider to upgrade to >= 2.7.9 in case of SSL errors")