def attr_from_script(scriptname, attr):
    """Import a script module and return one of its attributes.

    The dotted module path is built as ``SCRIPTS_NAME.scriptname``.  Import
    failures and missing attributes are reported through ``ERROR``; in both
    cases the function falls through and returns ``None``.

    :param scriptname: module name appended to ``SCRIPTS_NAME``.
    :param attr: name of the attribute to read from the imported module.
    :return: the attribute value, or ``None`` when it could not be loaded.
    """
    try:
        module_path = "%s.%s" % (SCRIPTS_NAME, scriptname)
        __import__(module_path)
        # __import__ returns the top-level package, so the leaf module is
        # taken from sys.modules instead.
        loaded = sys.modules[module_path]
        try:
            return getattr(loaded, attr)
        except AttributeError:
            ERROR("AttributeError,path:%s has no %s attribute" % (module_path, attr))
    except ImportError:
        ERROR("ImportError,path:%s" % module_path)
def runDomain(self, rule_id, run_domain, filename, risk):
    """Run one domain-level rule and hand its result to ``analyse_result``.

    :param rule_id: identifier of the rule; passed to ``update_progress`` and
        ``analyse_result`` and used in log messages.
    :param run_domain: callable invoked as ``run_domain(self.ruleattr)``.
    :param filename: script name, used only in the error log message.
    :param risk: value forwarded unchanged to ``analyse_result``.
    :return: always ``None``; failures are logged through ``ERROR``.
    """
    try:
        self.update_progress(rule_id)
        outcome = run_domain(self.ruleattr)
    except Exception:
        ERROR("rule_id:%s,scriptname:%s exception" % (rule_id, filename))
        return

    # A rule that returns None has nothing to analyse.
    if outcome is None:
        return

    try:
        self.analyse_result(rule_id, risk, outcome, self.domain)
    except Exception:
        ERROR('analyse_result exception,rule_id:%s' % rule_id)
def update_sitemap_parsed(self, action):
    """Store the sitemap-parsed flag for this task.

    :param action: ``'start'`` writes 0; ``'finish'`` and any other value
        write 1.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    flag_by_action = {'start': 0, 'finish': 1}
    new_flag = flag_by_action.get(action, 1)
    try:
        db.execute(
            "UPDATE task SET `sitemap_parsed`=%s where id=%s",
            new_flag,
            self.task_id,
        )
    except Exception:
        ERROR('update_sitemap_parsed exception,task_id:%s' % self.task_id)
def update_spider_flag(self, action):
    """Store the spider flag for this task.

    :param action: ``'finish'`` writes 3; ``'start'`` and any other value
        write 1.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    flag_by_action = {'start': 1, 'finish': 3}
    new_flag = flag_by_action.get(action, 1)
    try:
        db.execute(
            "UPDATE task SET `spider_flag`=%s where id=%s",
            new_flag,
            self.task_id,
        )
    except Exception:
        ERROR('update_spider_flag exception,task_id:%s' % self.task_id)
def get_exist_url(self):
    """Return every row of the ``url`` table that belongs to this task.

    :return: list of rows as yielded by ``db.iter``; an empty list when the
        task has no rows or the query fails (the failure is logged).
    """
    query = "SELECT * FROM url WHERE task_id=%s"
    try:
        rows = list(db.iter(query, self.task_id))
    except Exception:
        ERROR('get_url exception,task_id:%s' % self.task_id)
        rows = []
    return rows
def update_url_end_time(self, request):
    """Stamp the current time as ``end_time`` on the request's ``url`` row.

    :param request: object whose ``id`` attribute is the ``url`` row id;
        nothing is written when ``id`` is ``None``.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    if request.id is None:
        return
    try:
        db.execute(
            "UPDATE url SET `end_time`=%s WHERE id=%s",
            datetime.now(),
            request.id,
        )
    except Exception:
        ERROR('update_url_end_time exception,task_id:%s' % self.task_id)
def main():
    """Program entry point: parse the command line, initialise, run the scan.

    Every exception listed below is turned into a log line instead of
    propagating.  ``task_finsh_clean`` and the final "Scan finished!" message
    run whether or not the scan succeeded.
    """
    try:
        parseCmdline()
        init()
        run()
    except KeyboardInterrupt:
        INFO("User aborted,scan stop")
    except DestinationUnReachable as unreachable:
        WARN("Destination:%s not reachable,please check" % unreachable.dest)
    except TopException:
        ERROR("User define exception")
    except Exception:
        ERROR("Exception occur,scan stop")
    finally:
        task_finsh_clean()
        INFO("Scan finished!")
def start(cls, request, schedule):
    """Build a spider for ``request`` and run it, logging any failure.

    :param cls: callable used to construct the spider, invoked as
        ``cls(request, schedule)``.
    :param request: request handed to both the constructor and ``run``.
    :param schedule: scheduler object handed to the constructor.
    :return: ``None``; any exception is logged through ``ERROR``.
    """
    try:
        spider = cls(request, schedule)
        spider.run(request)
    # The exception object was never used, so the Python-2-only
    # ``except Exception, e`` binding is dropped; this form is valid on
    # both Python 2 and Python 3.
    except Exception:
        ERROR('Spider.start Exception')
def mkdir(path, remove=True):
    """Ensure ``path`` is a directory, optionally emptying an existing one.

    :param path: directory to create.
    :param remove: when ``True`` (default) an existing directory is deleted
        with its contents and created again; when ``False`` an existing
        directory is left untouched.
    :return: ``None``.  A failure while recreating an existing directory is
        logged through ``ERROR``; a failure creating a new one propagates.
    """
    if not os.path.isdir(path):
        os.mkdir(path)
        return

    if not remove:
        return

    try:
        shutil.rmtree(path)
        os.mkdir(path)
    except Exception:
        ERROR("rmtree except,path" + path)
def update_progress(self, rule_id):
    """Append ``rule_id`` to the task's ``|``-separated ``progress`` column.

    The current value is read, and ``rule_id`` is appended (prefixed with
    ``|``) only when it is not already one of the stored tokens.

    :param rule_id: identifier of the rule being run; it is stored in its
        ``'%s'`` string form.
    :return: ``None``; any failure is logged through ``ERROR``.
    """
    try:
        # Values are passed as query parameters, as the other task/url
        # helpers in this file do, rather than interpolated into the SQL.
        row = db.get("SELECT `progress` FROM task WHERE id=%s", self.task_id)
        # A NULL column is treated as "no progress recorded yet".
        progress = row.progress or ''
        # Compare using the same string form that gets stored; otherwise a
        # non-string rule_id never matches and is appended on every call.
        rule_token = '%s' % rule_id
        if rule_token in progress.split('|'):
            return
        progress += '|' + rule_token
        db.execute(
            "UPDATE task SET `progress`=%s WHERE id=%s",
            progress,
            self.task_id,
        )
    except Exception:
        ERROR("update_progress exception")
def pipeline(request):
    """Insert the request into ``URL_TABLE`` unless an equal row exists.

    A row counts as existing when it has the same task id, url, method and
    params.  The ``url``, ``method``, ``params`` and ``referer`` attributes of
    ``request`` are UTF-8 encoded before being sent to the database.

    :param request: object exposing ``url``, ``method``, ``params`` and
        ``referer`` attributes that support ``.encode('utf-8')``.
    :return: the result of ``db.execute`` for a new row; ``None`` when the
        row already exists or when any step fails (the failure is logged).
    """
    try:
        columns = ('url', 'method', 'params', 'referer')
        values = [getattr(request, name).encode('utf-8') for name in columns]

        count_sql = "SELECT COUNT(1) as `c` FROM %s" % (URL_TABLE)
        count_sql += " WHERE `task_id`=%s and `url`=%s and `method`=%s and `params`=%s"
        found = db.get(count_sql, conf.taskid, values[0], values[1], values[2])
        if found.c > 0:
            return

        insert_sql = "INSERT INTO %s" % (URL_TABLE)
        insert_sql += "(`task_id`,`url`,`method`,`params`,`referer`,`start_time`) VALUES(%s,%s,%s,%s,%s,%s)"
        values.append(datetime.now())
        return db.execute(insert_sql, conf.taskid, *values)
    except Exception:
        ERROR("Crawler.pipeline Exception")
def request(url, **kwargs):
    """Send an HTTP request via ``requests.request`` with scanner defaults.

    quick start:
        http://blog.csdn.net/iloveyin/article/details/21444613
        http://www.zhidaow.com/post/python-requests-install-and-brief-introduction

    ``method`` and ``decode`` are consumed here; every other keyword argument
    is forwarded to ``requests.request``.  Defaults applied when the caller
    does not supply a value: ``timeout`` from ``conf.timeout``, ``cookies``
    from ``conf.cookie`` (when set), ``allow_redirects`` true for
    GET/POST/OPTIONS and false otherwise, and every entry of ``HEADERS``
    whose name the caller did not already provide.

    :param url: URL to request.
    :param method: method for the new :class:`Request` object; defaults to
        ``DEFAULT_METHOD``.
    :param params: (optional) Dictionary or bytes to be sent in the query
        string for the :class:`Request`.
    :param data: (optional) Dictionary, bytes, or file-like object to send in
        the body of the :class:`Request`.
    :param headers: (optional) Dictionary of HTTP Headers to send with the
        :class:`Request`.  The dictionary is copied, not modified.
    :param cookies: (optional) Dict or CookieJar object to send with the
        :class:`Request`.
    :param files: (optional) Dictionary of 'name': file-like-objects (or
        {'name': ('filename', fileobj)}) for multipart encoding upload.
    :param auth: (optional) Auth tuple to enable Basic/Digest/Custom HTTP
        Auth.
    :param timeout: (optional) Float describing the timeout of the request in
        seconds.
    :param allow_redirects: (optional) Boolean. Set to True if
        POST/PUT/DELETE redirect following is allowed.
    :param proxies: (optional) Dictionary mapping protocol to the URL of the
        proxy.
    :param verify: (optional) if ``True``, the SSL cert will be verified. A
        CA_BUNDLE path can also be provided.
    :param stream: (optional) if ``False``, the response content will be
        immediately downloaded.
    :param cert: (optional) if String, path to ssl client cert file (.pem).
        If Tuple, ('cert', 'key') pair.
    :param decode: (optional) charset name.  When given, ``response.text`` is
        encoded with it and stored on ``response.text_encoded``; characters
        the charset cannot represent are replaced after logging an error.
    :return: the response object, or ``None`` when the request or its
        post-processing raised (the failure is logged through ``ERROR``).
    """

    def check_charset(response):
        # "ISO-8859-1" is treated as requests' default header encoding, so
        # prefer a charset found in the body when there is one.
        if response.encoding == "ISO-8859-1":
            declared = requests.utils.get_encodings_from_content(
                response.content)
            if declared:
                response.encoding = declared[0]
        return response

    # Work on a copy so the caller's headers dict is not mutated, and accept
    # an explicit headers=None.
    headers = dict(kwargs.get('headers') or {})
    kwargs['headers'] = headers
    kwargs.setdefault('timeout', conf.timeout)
    method = kwargs.pop('method', DEFAULT_METHOD)
    decode = kwargs.pop('decode', None)
    if conf.cookie:
        kwargs.setdefault('cookies', conf.cookie)

    # The method is upper-cased before the comparison, so every entry must be
    # upper-case too; the previous lowercase "options" could never match.
    follow_redirects = method.upper() in ("GET", "POST", "OPTIONS")
    kwargs.setdefault('allow_redirects', follow_redirects)

    # Add default headers only for names the caller did not provide; caller
    # names are compared in Title-Case form.
    supplied = set(name.title() for name in headers)
    headers.update(
        (name, value) for name, value in HEADERS.items()
        if name not in supplied)

    try:
        response = check_charset(requests.request(method, url, **kwargs))
        if decode:
            text = response.text
            try:
                encoded = text.encode(decode)
            except UnicodeEncodeError:
                ERROR("encodePage error,charset:%s,url:%s" %
                      (response.encoding, url))
                encoded = text.encode(decode, 'replace')
            response.text_encoded = encoded
        return response
    except Exception:
        ERROR("request exception,url:" + url)
def update_end_time(task_id):
    """Stamp the current time as ``end_time`` on the given task row.

    :param task_id: id of the row in the ``task`` table.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    finished_at = datetime.now()
    try:
        db.execute(
            "UPDATE task SET `end_time`=%s WHERE id=%s",
            finished_at,
            task_id,
        )
    except Exception:
        ERROR('update_end_time failed,task_id:%s,please check' % task_id)
def update_task_status(task_id):
    """Set ``status`` to 3 on the given task row.

    :param task_id: id of the row in the ``task`` table.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    # task_id is passed as a query parameter, matching update_end_time,
    # instead of being formatted into the SQL text.
    sql = "UPDATE task SET `status`=3 WHERE id=%s"
    try:
        db.execute(sql, task_id)
    except Exception:
        ERROR('update_task_status failed,task_id:%s,please check' % task_id)
def set_unreachable_flag(task_id):
    """Set ``reachable`` to 0 on the given task row.

    :param task_id: id of the row in the ``task`` table.
    :return: ``None``; a database failure is logged through ``ERROR``.
    """
    # task_id is passed as a query parameter, matching update_end_time,
    # instead of being formatted into the SQL text.
    sql = "UPDATE task SET `reachable`=0 WHERE id=%s"
    try:
        db.execute(sql, task_id)
    except Exception:
        ERROR('set_unreachable failed,task_id:%s,please check' % task_id)