Example #1
File: BBScan.py Project: sry309/BBScan
    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass

        if self.scheme == 'http' and self.port == 80 or self.scheme == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.scheme, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.scheme, self.host, self.port)

        if self.has_http:
            self.print_msg('Scan %s' % self.base_url)
        else:
            self.print_msg('Scan %s:%s' %
                           (self.host, self.port) if self.port else 'Scan %s' %
                           self.host)

        if self.has_http:
            if self.scheme == 'https':
                self.conn_pool = HTTPSConnPool(self.host,
                                               port=self.port,
                                               maxsize=self.args.t,
                                               headers=config.default_headers)
            else:
                self.conn_pool = HTTPConnPool(self.host,
                                              port=self.port,
                                              maxsize=self.args.t,
                                              headers=config.default_headers)
            if self.args.require_index_doc:
                self.crawl('/', do_not_process_links=True)

        if self.no_scripts != 1:  # not a duplicated target created by an 80/443 redirect, no need to scan it again
            # scripts not disabled for the current target, or plugin scan enabled globally
            if self.args.scripts_only or not self.no_scripts:
                for _ in self.user_scripts:
                    self.url_queue.put((_, '/'))

        if not self.has_http or self.args.scripts_only:  # no HTTP service found, or plugin-only scan requested
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
        else:
            self.check_404_existence()
        if self._404_status == -1:
            self.print_msg('[Warning] HTTP 404 check failed <%s:%s>' %
                           (self.host, self.port))
        elif self._404_status != 404:
            self.print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)

        self.enqueue('/')
        if _path != '/' and not self.log_file:
            self.enqueue(_path)
Example #2
    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(
            ':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        is_port_open = self.is_port_open()
        if is_port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host,
                                               port=self.port,
                                               maxsize=self.args.t * 2,
                                               headers=HEADERS)
            else:
                self.conn_pool = HTTPConnPool(self.host,
                                              port=self.port,
                                              maxsize=self.args.t * 2,
                                              headers=HEADERS)

        if self.args.scripts_only or (not is_port_open
                                      and not self.args.no_scripts):
            for _ in self.user_scripts:
                self.url_queue.put((_, '/'))
            print_msg('Scan with scripts: %s' % self.host)
            return

        if not is_port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_status_404 = True
        else:
            self.check_404_existence()
        if self._404_status == -1:
            print_msg('[Warning] HTTP 404 check failed <%s:%s>' %
                      (self.host, self.port))
        elif not self.has_status_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        self.enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl(_path)
Example #3
File: BBScan.py Project: 5alt/BBScan
    def init_final(self):
        try:
            self.conn_pool.close()
        except:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(
            ':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        is_port_open = self.is_port_open()
        if is_port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host,
                                               port=self.port,
                                               maxsize=self.args.t * 2,
                                               headers=headers)
            else:
                self.conn_pool = HTTPConnPool(self.host,
                                              port=self.port,
                                              maxsize=self.args.t * 2,
                                              headers=headers)

        if not is_port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_404 = True
        else:
            self.check_404()  # check existence of HTTP 404
        if not self.has_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.host)

        self.request_index(self.path)
        self.gather_info()

        _path, _depth = cal_depth(self, self.path)
        self._enqueue('/')
        self._enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl_index()
Example #4
File: BBScan.py Project: 5alt/BBScan
class InfoDisScanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.START_TIME = time.time()
        self.TIME_OUT = timeout
        self.LINKS_LIMIT = 100  # max number of Folders to scan

        self.full_scan = args.full_scan
        self._init_rules()
        self._init_scripts()

        self.url_queue = Queue.Queue()  # all urls to scan
        self.urls_processed = set()  # processed urls
        self.urls_enqueued = set()  # entered queue urls

        self.lock = threading.Lock()

    # reset scanner
    def init_reset(self):
        self.START_TIME = time.time()
        self.url_queue.queue.clear()
        self.urls_processed = set()
        self.urls_enqueued = set()
        self.index_a_urls = set()
        self.scripts_enqueued = set()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.rewrite = False
        self.server = ''
        self.lang = ''

    # scan from a given URL
    def init_from_url(self, url):
        self.init_reset()
        if not url.find('://') > 0:
            self.url = 'http://' + url
        else:
            self.url = url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.init_reset()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
        else:
            self.init_from_url(os.path.basename(log_file).replace('.log', ''))

    #
    def init_final(self):
        try:
            self.conn_pool.close()
        except:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(
            ':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        is_port_open = self.is_port_open()
        if is_port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host,
                                               port=self.port,
                                               maxsize=self.args.t * 2,
                                               headers=headers)
            else:
                self.conn_pool = HTTPConnPool(self.host,
                                              port=self.port,
                                              maxsize=self.args.t * 2,
                                              headers=headers)

        if not is_port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_404 = True
        else:
            self.check_404()  # check existence of HTTP 404
        if not self.has_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.host)

        self.request_index(self.path)
        self.gather_info()

        _path, _depth = cal_depth(self, self.path)
        self._enqueue('/')
        self._enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl_index()

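    # TCP connect() probe with a 5-second timeout; SO_LINGER(1, 0) makes close()
    # send an RST so large scans do not leave sockets stuck in TIME_WAIT.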
    def is_port_open(self):
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                self.lock.acquire()
                print_msg('Scan web: %s' % self.base_url)
                self.lock.release()
                return True
            else:
                print_msg('[Warning] Fail to connect to %s:%s' %
                          (self.host, self.port))
                return False
        except Exception as e:
            return False
        finally:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
                         struct.pack('ii', 1, 0))
            s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for line in infile.xreadlines():
                line = line.strip()
                if line and len(line.split()) >= 3:
                    url = line.split()[1]
                    break
        return parse_url(url)

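    # Parse one rule file: every line starting with '/' becomes a tuple
    # (path, tag, status, content_type, content_type_no, root_only, lang, rewrite),
    # with the optional fields taken from inline {key="..."} directives.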
    def _load_rules(self, rule_file):
        rules = []
        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile('{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')
        p_lang = re.compile('{lang="(.*?)"}')
        with open(rule_file, 'r') as infile:
            for url in infile.xreadlines():
                url = url.strip()
                if url.startswith('/'):
                    _ = p_tag.search(url)
                    tag = _.group(1) if _ else ''

                    _ = p_status.search(url)
                    status = int(_.group(1)) if _ else 0

                    _ = p_content_type.search(url)
                    content_type = _.group(1) if _ else ''

                    _ = p_content_type_no.search(url)
                    content_type_no = _.group(1) if _ else ''

                    _ = p_lang.search(url)
                    lang = _.group(1) if _ else ''

                    root_only = True if url.find('{root_only}') >= 0 else False

                    rewrite = True if url.find('{rewrite}') >= 0 else False

                    rule = (url.split()[0], tag, status, content_type,
                            content_type_no, root_only, lang, rewrite)
                    rules.append(rule)
        return rules

    #
    # load urls from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()

        for rule_file in glob.glob('rules/*.txt'):
            rules = self._load_rules(rule_file)
            for rule in rules:
                if rule not in self.rules_set:
                    self.rules_set.add(rule)
                else:
                    print 'Duplicated Rule:', rule

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        _file_path = 'rules/white.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_find.append(
                        re.compile(_m.group(1).decode('utf-8', 'ignore')))

        _file_path = 'rules/black.list'
        if not os.path.exists(_file_path):
            return
        for line in open(_file_path):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            _m = re_text.search(line)
            if _m:
                self.text_to_exclude.append(
                    _m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(line)
                if _m:
                    self.regex_to_exclude.append(
                        re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # disable user scripts scan
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                _ = importlib.import_module('scripts.%s' % script_name)
                self.user_scripts.append(_)
            except Exception as e:
                print e

    #
    def _http_request(self, url, timeout=30):
        try:
            if not url:
                url = '/'
            # print 'request', self.base_url + url
            resp = self.conn_pool.urlopen('GET',
                                          self.base_url + url,
                                          redirect=False,
                                          timeout=timeout,
                                          retries=0)
            resp_headers = resp.headers
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''

    #
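    # Request a path that should not exist; if the server does not answer 404,
    # remember the body length as a soft-404 baseline for later comparisons.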
    def check_404(self):
        try:
            try:
                self._404_status, headers, html_doc = self._http_request(
                    '/BBScan-404-existence-check')
            except:
                self._404_status, headers, html_doc = -1, {}, ''

            self.has_404 = (self._404_status == 404)
            if not self.has_404:
                self.len_404_doc = len(html_doc)
            return self.has_404
        except Exception as e:
            logging.error('[Check_404] Exception %s' % str(e))

    def _enqueue_request(self, prefix, full_url, rule):
        if self.args.scripts_only:
            return
        if full_url in self.urls_enqueued:
            return
        url_description = {'prefix': prefix, 'full_url': full_url}
        item = (url_description, rule[1], rule[2], rule[3], rule[4], rule[5],
                rule[6], rule[7])
        self.url_queue.put(item)
        self.urls_enqueued.add(full_url)

    def _enqueue_script(self, module, prefix):
        if self.args.no_scripts:
            return
        if not prefix:
            prefix = '/'
        if (module.__name__, prefix) in self.scripts_enqueued:
            return
        self.url_queue.put((module, prefix))
        self.scripts_enqueued.add((module.__name__, prefix))

    #
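    # Queue every applicable rule and user script for this folder. Digits are
    # collapsed to {num} so numeric variants of the same path are processed only
    # once, and at most LINKS_LIMIT distinct patterns are accepted.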
    def _enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub('\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(
                    self.urls_processed) >= self.LINKS_LIMIT:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print 'Entered Queue:', url
            for _ in self.rules_set:
                # rewrite & lang check
                if self.rewrite and not _[7]:
                    continue
                elif self.lang and self.lang != 'unknown':
                    if _[6] and self.lang != _[6]:
                        continue

                # root_only
                if _[5] and url != '/':
                    continue

                full_url = url.rstrip('/') + _[0]
                self._enqueue_request(url.rstrip('/'), full_url, _)

            if self.full_scan and url.count('/') >= 2:
                self._enqueue('/'.join(url.split('/')[:-2]) +
                              '/')  # sub folder enqueue

            for _ in self.user_scripts:
                self._enqueue_script(_, url.rstrip('/'))

            return True
        except Exception as e:
            print '[_enqueue.exception] %s' % str(e)
            return False

    #
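    # Fetch the start path; on a non-200 status it is re-fetched with
    # headers_without_range. The index status/headers/body are saved and all
    # <a href> targets collected for later crawling.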
    def request_index(self, path):
        try:
            status, headers, html_doc = self._http_request(path)
            if status != 200:
                try:
                    html_doc = self.conn_pool.urlopen(
                        'GET',
                        self.url,
                        headers=headers_without_range,
                        retries=1).data
                    html_doc = decode_response_text(html_doc)
                except Exception as e:
                    pass
            self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc  # save index content
            soup = BeautifulSoup(self.index_html_doc, "html.parser")
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                self.index_a_urls.add(url)

        except Exception as e:
            logging.error('[request_index Exception] %s' % str(e))
            traceback.print_exc()

    def gather_info(self):
        if not self.server:
            self.server = check_server(self.index_headers.get('server', ''))

        if not self.lang:
            self.lang, self.framework = check_lang(self.base_url,
                                                   self.index_headers)

        if self.lang == 'unknown':
            for url in self.index_a_urls:
                url, depth = cal_depth(self, url)
                lang = check_lang_url(url)
                if lang != 'unknown':
                    self.lang = lang
                    break
        self.rewrite = check_rewrite(self.server, self.lang)

    def crawl_index(self):
        for url in self.index_a_urls:
            url, depth = cal_depth(self, url)
            if depth <= self.max_depth:
                self._enqueue(url)
        if self.find_text(self.index_html_doc):
            self.results['/'] = []
            m = re.search('<title>(.*?)</title>', self.index_html_doc)
            title = m.group(1) if m else ''
            _ = {
                'status': self.index_status,
                'url': '%s%s' % (self.base_url, self.path),
                'title': title
            }
            if _ not in self.results['/']:
                self.results['/'].append(_)

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as inFile:
                for line in inFile.xreadlines():
                    _ = line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0
                                        or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self._enqueue(url)
        except Exception as e:
            logging.error('[load_all_urls_from_log_file Exception] %s' %
                          str(e))
            traceback.print_exc()

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

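    # Fetch a rule's URL (after expanding the {sub}/{hostname} placeholders) and
    # decide whether the response matches: excluded text, tiny text bodies, images
    # and stray JSON are rejected, then tag, content type, status and the
    # soft-404 length heuristic are applied.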
    def apply_rules(self, item):
        url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
        prefix = url_description['prefix']
        url = url_description['full_url']
        # print url
        url = url.replace('{sub}', self.domain_sub)
        if url.find('{hostname_or_folder}') >= 0:
            _url = url[:url.find('{hostname_or_folder}')]
            folders = _url.split('/')
            for _folder in reversed(folders):
                if _folder not in ['', '.', '..']:
                    url = url.replace('{hostname_or_folder}', _folder)
                    break
        url = url.replace('{hostname_or_folder}', self.domain_sub)
        url = url.replace('{hostname}', self.domain_sub)

        if not item or not url:
            return False, None, None, None

        # print '[%s]' % url.strip()
        try:
            status, headers, html_doc = self._http_request(url)
            cur_content_type = headers.get('content-type', '')

            if self.find_exclude_text(html_doc):  # excluded text found
                return False, status, headers, html_doc

            if ('html' in cur_content_type or 'text' in cur_content_type) and \
                                    0 <= len(html_doc) <= 10:  # text too short
                return False, status, headers, html_doc

            if cur_content_type.find('image/') >= 0:  # exclude image
                return False, status, headers, html_doc

            valid_item = False
            if self.find_text(html_doc):
                valid_item = True
            else:
                if cur_content_type.find(
                        'application/json') >= 0 and not url.endswith(
                            '.json'):  # no json
                    return False, status, headers, html_doc

                if status != status_to_match and status != 206:  # status in [301, 302, 400, 404, 501, 502, 503, 505]
                    return False, status, headers, html_doc

                if tag:
                    if html_doc.find(tag) >= 0:
                        valid_item = True
                    else:
                        return False, status, headers, html_doc  # tag mismatch

                if (content_type and cur_content_type.find(content_type) < 0) \
                        or (content_type_no and cur_content_type.find(content_type_no) >= 0):
                    return False, status, headers, html_doc  # type mismatch

                if self.has_404 or status != self._404_status:
                    if status_to_match in (200, 206) and status == 206:
                        valid_item = True
                    elif status_to_match and status != status_to_match:  # status mismatch
                        return False, status, headers, html_doc
                    elif status_to_match != 403 and status == 403:
                        return False, status, headers, html_doc
                    else:
                        valid_item = True

                if not self.has_404 and status in (
                        200, 206) and url != '/' and not tag:
                    _len = len(html_doc)
                    _min = min(_len, self.len_404_doc)
                    if _min == 0:
                        _min = 10.0
                    if float(_len - self.len_404_doc) / _min > 0.3:
                        valid_item = True

                if status == 206 and tag == '' and cur_content_type.find(
                        'text') < 0 and cur_content_type.find('html') < 0:
                    valid_item = True

            return valid_item, status, headers, html_doc

        except Exception as e:
            logging.error('[_scan_worker.Exception][3][%s] %s' % (url, str(e)))
            traceback.print_exc()

    #
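    # Worker loop: drain the queue until it is empty or the task times out.
    # 2-tuples are user scripts (module, prefix); longer tuples are rule checks
    # whose matches are appended to self.results keyed by folder prefix.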
    def _scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.START_TIME > self.TIME_OUT:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.host)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                print e
                return
            try:
                if len(item) == 2:  # User Script
                    check_func = getattr(item[0], 'do_check')
                    check_func(self, item[1])
                    continue
            except Exception as e:
                logging.error('[_scan_worker Exception] [1] %s' % str(e))
                traceback.print_exc()
                continue

            url_description, tag, status_to_match, content_type, content_type_no, root_only, lang, rewrite = item
            prefix = url_description['prefix']
            url = url_description['full_url']
            valid_item, status, headers, html_doc = self.apply_rules(item)

            try:
                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host +  url)
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {
                        'status': status,
                        'url': '%s%s' % (self.base_url, url),
                        'title': title
                    }
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()

                if len(self.results) >= 10:
                    print '[Warning] Over 10 vulnerabilities found [%s], seems to be false positives.' % prefix
                    self.url_queue.queue.clear()
            except Exception as e:
                logging.error('[_scan_worker.Exception][2][%s] %s' %
                              (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self._scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()
            '''
            for key in self.results.keys():
                if len(self.results[key]) > 5:  # Over 5 URLs found under this folder, show first only
                    self.results[key] = self.results[key][:1]
            '''
            return '%s:%s' % (self.host, self.port), self.results
        except Exception as e:
            print '[scan exception] %s' % str(e)
        self.conn_pool.close()
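
A minimal driver sketch for the InfoDisScanner class above (hypothetical, not part of BBScan itself; the `args` attributes simply mirror what the scanner reads, and it has to run from the BBScan source directory so rules/ and scripts/ resolve):

    import argparse

    # Hypothetical argument set covering every attribute the class dereferences.
    args = argparse.Namespace(t=10, full_scan=False, no_scripts=False, scripts_only=False,
                              no_check404=False, no_crawl=False)
    scanner = InfoDisScanner(timeout=600, args=args)
    scanner.init_from_url('http://www.example.com/')   # probe the port, build the conn pool, fill the queue
    host_port, results = scanner.scan(threads=args.t)  # results: folder prefix -> list of {'status', 'url', 'title'}
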
Example #5
class Scanner(object):
    def __init__(self, q_results, timeout=600, args=None):
        self.q_results = q_results
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan

        self._init_rules()
        self._init_scripts()

        self.url_queue = Queue.Queue()  # all urls to scan
        self.urls_processed = set()     # processed urls
        self.urls_enqueued = set()      # entered queue urls
        self.urls_crawled = set()

        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.max_depth = 0
        self.len_404_doc = 0
        self.has_http = None
        self.ports_open = None
        self.ports_closed = None
        self.no_scripts = None
        self.status_502_count = 0

    def print_msg(self, msg):
        self.q_results.put(msg)

    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.scheme, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = ''
        self.base_url = ''
        self.status_502_count = 0

    # scan from a given URL
    def init_from_url(self, target):
        self.reset_scanner()
        self.scheme = target['scheme']
        self.host = target['host']
        self.port = target['port']
        self.path = target['path']
        self.has_http = target['has_http']
        self.ports_open = target['ports_open']
        self.ports_closed = target['ports_closed']
        self.no_scripts = target['no_scripts'] if 'no_scripts' in target else 0
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()
        return True

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.scheme, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            if self.host.find(':') > 0:
                _ret = self.host.split(':')
                self.host = _ret[0]
                self.port = _ret[1]
            elif self.scheme == 'https':
                self.port = 443
            elif self.scheme == 'http':
                self.port = 80
            else:
                self.port = None
            if not is_port_open(self.host, self.port):
                self.print_msg('[Port Not Open] %s:%s' % (self.host, self.port))
                return False
            self.has_http = True
            self.no_scripts = 1
            self.init_final()
            self.load_all_urls_from_log_file()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)     # Fix Me
                return True
            except Exception as e:
                self.print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass

        if self.scheme == 'http' and self.port == 80 or self.scheme == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.scheme, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.scheme, self.host, self.port)

        if self.has_http:
            self.print_msg('Scan %s' % self.base_url)
        else:
            self.print_msg('Scan %s:%s' % (self.host, self.port) if self.port else 'Scan %s' % self.host)

        if self.has_http:
            if self.scheme == 'https':
                self.conn_pool = HTTPSConnPool(self.host, port=self.port, maxsize=self.args.t,
                                               headers=config.default_headers)
            else:
                self.conn_pool = HTTPConnPool(self.host, port=self.port, maxsize=self.args.t,
                                              headers=config.default_headers)
            if self.args.require_index_doc:
                self.crawl('/', do_not_process_links=True)

        if self.no_scripts != 1:   # not a duplicated target created by an 80/443 redirect, no need to scan it again
            # scripts not disabled for the current target, or plugin scan enabled globally
            if self.args.scripts_only or not self.no_scripts:
                for _ in self.user_scripts:
                    self.url_queue.put((_, '/'))

        if not self.has_http or self.args.scripts_only:    # no HTTP service found, or plugin-only scan requested
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
        else:
            self.check_404_existence()
        if self._404_status == -1:
            self.print_msg('[Warning] HTTP 404 check failed <%s:%s>' % (self.host, self.port))
        elif self._404_status != 404:
            self.print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)

        self.enqueue('/')
        if _path != '/' and not self.log_file:
            self.enqueue(_path)

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    # load urls from rules/*.txt
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        _files = self.args.rule_files if self.args.rule_files else glob.glob('rules/*.txt')

        for rule_file in _files:
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''

                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0

                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''

                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''

                        root_only = True if url.find('{root_only}') >= 0 else False

                        rule = (url.split()[0], tag, status, content_type, content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                self.print_msg('Duplicated root only rule: %s' % str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                self.print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            self.print_msg('[ERROR] File not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(re.compile(_m.group(1).decode('utf-8', 'ignore')))

    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:    # plugins disabled globally, nothing to import
            return
        for _script in glob.glob('scripts/*.py'):
            script_name_origin = os.path.basename(_script)
            script_name = script_name_origin.replace('.py', '')
            if self.args.script:    # only import the scripts specified by the user
                if script_name not in self.args.script and script_name_origin not in self.args.script:
                    continue
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                self.print_msg('[ERROR] Fail to load script %s' % script_name)

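    # GET base_url + url through the shared connection pool without following
    # redirects; bodies are decoded only when they look textual or are <= 20 KB,
    # and more than three 502 responses cause the whole target to be dropped.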
    def http_request(self, url, headers=config.default_headers, timeout=20):
        try:
            if not url:
                url = '/'
            if not self.conn_pool:
                return -1, {}, ''
            if self.args.debug:
                self.print_msg('--> %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                          headers=headers, assert_same_host=False,
                                          redirect=False, timeout=timeout, retries=0)
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            if resp.status == 502:    # if 502 shows up more than three times, drop this site
                self.status_502_count += 1
                if self.status_502_count > 3:
                    self.url_queue.queue.clear()
                    try:
                        if self.conn_pool:
                            self.conn_pool.close()
                    except Exception as e:
                        pass
                    self.conn_pool = None
                    # self.print_msg('Website 502: %s' % self.base_url)

            return resp.status, resp.headers, html_doc
        except urllib3.exceptions.MaxRetryError as e:
            return -1, {}, ''
        except TypeError as e:
            return -1, {}, ''
        except Exception as e:
            self.print_msg(str(e))
            return -1, {}, ''

    # check existence of status 404
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request('/BBScan-404-existence-check')
            except Exception as e:
                self.print_msg('[Warning] HTTP 404 check failed: %s' % self.base_url)
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status != 404:
                self.len_404_doc = len(html_doc)
        except Exception as e:
            self.print_msg('[Check_404] Exception %s %s' % (self.base_url, str(e)))

    #
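    # Register a folder for scanning: crawl it (unless no_crawl is set), expand it
    # against the rule sets, and queue the user scripts. Digits are collapsed to
    # {num} so numeric variants of the same path are handled only once, capped at
    # links_limit distinct patterns.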
    def enqueue(self, url):
        try:
            url = str(url)
        except Exception as e:
            return False
        try:
            url_pattern = re.sub(r'\d+', '{num}', url)
            if url_pattern in self.urls_processed or len(self.urls_processed) >= self.links_limit:
                return False

            self.urls_processed.add(url_pattern)
            # self.print_msg('Entered Queue: %s' % url)
            if not self.args.no_crawl:   # no crawl
                self.crawl(url)
            if self._404_status != -1:    # valid web service
                rule_set_to_process = [self.rules_set, self.rules_set_root_only] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':    # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {'prefix': url.rstrip('/'), 'full_url': full_url}
                        item = (url_description, _[1], _[2], _[3], _[4], _[5], _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)

            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) + '/')  # sub folder enqueue

            if url != '/' and not self.no_scripts:
                for script in self.user_scripts:
                    self.url_queue.put((script, url))
            return True
        except Exception as e:
            self.print_msg('[_enqueue.exception] %s' % str(e))
            return False

    #
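    # Fetch one path with a Range header capping the body at ~200 KB, enqueue the
    # <a href> links found within max_depth, and record a finding if any
    # whitelist text/regex from rules/white.list matches the page.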
    def crawl(self, path, do_not_process_links=False):
        try:
            # increase body size to 200 KB
            headers = dict(config.default_headers, Range='bytes=0-204800')
            status, headers, html_doc = self.http_request(path, headers=headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if not self.args.no_crawl and not do_not_process_links and html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:   # relative path
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
                #
                ret = self.find_text(html_doc)
                if ret:
                    if '/' not in self.results:
                        self.results['/'] = []
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, path), 'title': title, 'vul_type': ret[1]}
                    if _ not in self.results['/']:
                        self.results['/'].append(_)

        except Exception as e:
            self.print_msg('[crawl Exception] %s %s' % (path, str(e)))

    #
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0 or _[2].find('^^^403') > 0 or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            self.print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
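    # Worker loop: pop items until the queue drains or the task times out.
    # 2-tuples run a user script's do_check(); 7-element rule tuples are checked
    # against excluded text, body size, content type, tag, status and a soft-404
    # length ratio before being recorded as findings.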
    def scan_worker(self):
        while True:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                self.print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # Script Scan
                    check_func = getattr(item[0], 'do_check')
                    # self.print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # self.print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']

                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)

            except Exception as e:
                self.print_msg('[scan_worker.1] %s' % str(e))
                self.print_msg(traceback.format_exc())
                continue
            if not item or not url:
                break

            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length', len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude image
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 and \
                        not url.endswith('.json'):    # invalid json
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue    # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue    # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <= 0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    self.lock.acquire()
                    # self.print_msg('[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host +  url))
                    if prefix not in self.results:
                        self.results[prefix] = []
                    _ = {'status': status, 'url': '%s%s' % (self.base_url, url), 'title': title, 'vul_type': vul_type}
                    if _ not in self.results[prefix]:
                        self.results[prefix].append(_)
                    self.lock.release()
            except Exception as e:
                self.print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                # Over 5 URLs found under this folder, keep the first one only
                if len(self.results[key]) > 5:
                    self.results[key] = self.results[key][:1]
            return self.base_url.lstrip('unknown://').rstrip(':None'), self.results
        except Exception as e:
            self.print_msg('[scan exception] %s' % str(e))
        self.conn_pool.close()
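
For reference, the rule files loaded by `_init_rules` above (rules/*.txt, or the files passed via args.rule_files) are plain text: each line starting with '/' is a path to request, optionally annotated with the inline directives matched by the regexes above. Two hypothetical lines, not taken from the shipped rule files, just to illustrate the syntax:

    /install/ {status=200} {tag="<title>Install"} {root_only}
    /config.php.bak {status=206} {type_no="html"}
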
Example #6
class Scanner(object):
    def __init__(self, timeout=600, args=None):
        self.args = args
        self.start_time = time.time()
        self.time_out = timeout
        self.links_limit = 100  # max number of folders to scan

        self._init_rules()
        self._init_scripts()

        self.url_queue = Queue.Queue()  # all urls to scan
        self.urls_processed = set()  # processed urls
        self.urls_enqueued = set()  # entered queue urls
        self.urls_crawled = set()

        self.lock = threading.Lock()
        self.results = {}
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''
        self.url = ''
        self.schema, self.host, self.port, self.path = None, None, None, None
        self.domain_sub = self.base_url = ''
        self.has_status_404 = True
        self.max_depth = 0
        self.len_404_doc = 0

    # reset scanner
    def reset_scanner(self):
        self.start_time = time.time()
        self.url_queue.queue.clear()
        self.urls_processed.clear()
        self.urls_enqueued.clear()
        self.urls_crawled.clear()
        self.results.clear()
        self.log_file = None
        self._404_status = -1
        self.conn_pool = None
        self.index_status, self.index_headers, self.index_html_doc = None, {}, ''

    # scan from a given URL
    def init_from_url(self, url):
        self.reset_scanner()
        self.url = 'http://' + url if url.find('://') < 0 else url
        self.schema, self.host, self.path = parse_url(url)
        self.domain_sub = get_domain_sub(self.host)
        self.init_final()

    def init_from_log_file(self, log_file):
        self.reset_scanner()
        self.log_file = log_file
        self.schema, self.host, self.path = self._parse_url_from_file()
        self.domain_sub = get_domain_sub(self.host)
        if self.host:
            self.load_all_urls_from_log_file()
            self.init_final()
            return True
        else:
            host = os.path.basename(log_file).replace('.log', '')
            try:
                socket.gethostbyname(host)
                self.init_from_url(host)
                return True
            except Exception as e:
                print_msg('[ERROR] Invalid host from log name: %s' % host)
                return False

    #
    def init_final(self):
        try:
            if self.conn_pool:
                self.conn_pool.close()
        except Exception as e:
            pass
        default_port = 443 if self.schema.lower() == 'https' else 80
        self.host, self.port = self.host.split(
            ':') if self.host.find(':') > 0 else (self.host, default_port)
        self.port = int(self.port)
        if self.schema == 'http' and self.port == 80 or self.schema == 'https' and self.port == 443:
            self.base_url = '%s://%s' % (self.schema, self.host)
        else:
            self.base_url = '%s://%s:%s' % (self.schema, self.host, self.port)

        is_port_open = self.is_port_open()
        if is_port_open:
            if self.schema == 'https':
                self.conn_pool = HTTPSConnPool(self.host,
                                               port=self.port,
                                               maxsize=self.args.t * 2,
                                               headers=HEADERS)
            else:
                self.conn_pool = HTTPConnPool(self.host,
                                              port=self.port,
                                              maxsize=self.args.t * 2,
                                              headers=HEADERS)

        if self.args.scripts_only or (not is_port_open
                                      and not self.args.no_scripts):
            for _ in self.user_scripts:
                self.url_queue.put((_, '/'))
            print_msg('Scan with scripts: %s' % self.host)
            return

        if not is_port_open:
            return

        self.max_depth = cal_depth(self, self.path)[1] + 5
        if self.args.no_check404:
            self._404_status = 404
            self.has_status_404 = True
        else:
            self.check_404_existence()
        if self._404_status == -1:
            print_msg('[Warning] HTTP 404 check failed <%s:%s>' %
                      (self.host, self.port))
        elif not self.has_status_404:
            print_msg('[Warning] %s has no HTTP 404.' % self.base_url)
        _path, _depth = cal_depth(self, self.path)
        self.enqueue('/')
        self.enqueue(_path)
        if not self.args.no_crawl and not self.log_file:
            self.crawl(_path)

    def is_port_open(self):
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(5.0)
            if s.connect_ex((self.host, int(self.port))) == 0:
                print_msg('scan web: %s:%s' % (self.host, self.port))
                return True
            else:
                print_msg('[Warning] Fail to connect to %s' % self.base_url)
                return False
        except Exception as e:
            return False
        finally:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
                         struct.pack('ii', 1, 0))
            s.close()

    #
    def _parse_url_from_file(self):
        url = ''
        with open(self.log_file) as infile:
            for _line in infile.xreadlines():
                _line = _line.strip()
                if _line and len(_line.split()) >= 3:
                    url = _line.split()[1]
                    break
        return parse_url(url)

    #
    # load urls from rules/*.txt
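    # A rule line is a path followed by optional matchers, e.g. (illustrative only):
    #   /some/path {status=200} {tag="keyword"} {type="html"} {root_only}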
    def _init_rules(self):
        self.text_to_find = []
        self.regex_to_find = []
        self.text_to_exclude = []
        self.regex_to_exclude = []
        self.rules_set = set()
        self.rules_set_root_only = set()

        p_tag = re.compile('{tag="(.*?)"}')
        p_status = re.compile(r'{status=(\d{3})}')
        p_content_type = re.compile('{type="(.*?)"}')
        p_content_type_no = re.compile('{type_no="(.*?)"}')

        for rule_file in glob.glob('rules/*.txt'):
            with open(rule_file, 'r') as infile:
                vul_type = os.path.basename(rule_file)[:-4]
                for url in infile.xreadlines():
                    url = url.strip()
                    if url.startswith('/'):
                        _ = p_tag.search(url)
                        tag = _.group(1) if _ else ''

                        _ = p_status.search(url)
                        status = int(_.group(1)) if _ else 0

                        _ = p_content_type.search(url)
                        content_type = _.group(1) if _ else ''

                        _ = p_content_type_no.search(url)
                        content_type_no = _.group(1) if _ else ''

                        root_only = url.find('{root_only}') >= 0

                        rule = (url.split()[0], tag, status, content_type,
                                content_type_no, root_only, vul_type)
                        if root_only:
                            if rule not in self.rules_set_root_only:
                                self.rules_set_root_only.add(rule)
                            else:
                                print_msg('Duplicated root only rule: %s' %
                                          str(rule))
                        else:
                            if rule not in self.rules_set:
                                self.rules_set.add(rule)
                            else:
                                print_msg('Duplicated rule: %s' % str(rule))

        re_text = re.compile('{text="(.*)"}')
        re_regex_text = re.compile('{regex_text="(.*)"}')

        file_path = 'rules/white.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File does not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_find.append(_m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_find.append(
                        re.compile(_m.group(1).decode('utf-8', 'ignore')))

        file_path = 'rules/black.list'
        if not os.path.exists(file_path):
            print_msg('[ERROR] File does not exist: %s' % file_path)
            return
        for _line in open(file_path):
            _line = _line.strip()
            if not _line or _line.startswith('#'):
                continue
            _m = re_text.search(_line)
            if _m:
                self.text_to_exclude.append(
                    _m.group(1).decode('utf-8', 'ignore'))
            else:
                _m = re_regex_text.search(_line)
                if _m:
                    self.regex_to_exclude.append(
                        re.compile(_m.group(1).decode('utf-8', 'ignore')))

    #
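    # Load plugin modules from scripts/*.py (names starting with '_' are skipped).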
    def _init_scripts(self):
        self.user_scripts = []
        if self.args.no_scripts:  # disable user scripts scan
            return
        for _script in glob.glob('scripts/*.py'):
            script_name = os.path.basename(_script).replace('.py', '')
            if script_name.startswith('_'):
                continue
            try:
                self.user_scripts.append(
                    importlib.import_module('scripts.%s' % script_name))
            except Exception as e:
                print_msg('[ERROR] Fail to load script %s' % script_name)

    #
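    # One GET through the shared pool with redirects disabled; returns
    # (status, headers, body), or (-1, {}, '') on any failure.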
    def http_request(self, url, headers=HEADERS, timeout=30):
        try:
            if not url:
                url = '/'
            # print_msg('request %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET',
                                          self.base_url + url,
                                          headers=headers,
                                          redirect=False,
                                          timeout=timeout,
                                          retries=0)
            status = resp.status
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            return status, resp.headers, html_doc
        except Exception as e:
            return -1, {}, ''

    # check existence of HTTP 404
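    # Probe a path that should not exist and remember its status / body length
    # as the server's "not found" fingerprint.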
    def check_404_existence(self):
        try:
            try:
                self._404_status, _, html_doc = self.http_request(
                    '/BBScan-404-existence-check')
            except Exception as e:
                print_msg('[Warning] HTTP 404 check failed <%s:%s>' %
                          (self.host, self.port))
                self._404_status, _, html_doc = -1, {}, ''
            if self._404_status == 404:
                self.has_status_404 = True
            else:
                self.has_status_404 = False
                self.len_404_doc = len(html_doc)
        except Exception as e:
            logging.error('[Check_404] Exception %s %s' %
                          (self.base_url, str(e)))

    #
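    # Deduplicate URLs by a digit-normalised pattern, crawl the path, then queue
    # every matching rule and user script for it.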
    def enqueue(self, url):
        try:
            url = str(url)
            url_pattern = re.sub(r'\d+', '{num}', url)
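            # e.g. both /2019/08/ and /2020/01/ collapse to /{num}/{num}/ (illustrative)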
            if url_pattern in self.urls_processed or len(
                    self.urls_processed) >= self.links_limit:
                return False
            else:
                self.urls_processed.add(url_pattern)
            # print_msg('Entered Queue: %s' % url)
            self.crawl(url)
            if self._404_status != -1:  # valid web service
                rule_set_to_process = [
                    self.rules_set, self.rules_set_root_only
                ] if url == '/' else [self.rules_set]
                for rule_set in rule_set_to_process:
                    for _ in rule_set:
                        if _[5] and url != '/':  # root only
                            continue
                        try:
                            full_url = url.rstrip('/') + _[0]
                        except Exception as e:
                            continue
                        if full_url in self.urls_enqueued:
                            continue
                        url_description = {
                            'prefix': url.rstrip('/'),
                            'full_url': full_url
                        }
                        item = (url_description, _[1], _[2], _[3], _[4], _[5],
                                _[6])
                        self.url_queue.put(item)
                        self.urls_enqueued.add(full_url)

            if self.args.full_scan and url.count('/') >= 2:
                self.enqueue('/'.join(url.split('/')[:-2]) +
                             '/')  # sub folder enqueue

            for script in self.user_scripts:
                self.url_queue.put((script, url))
            return True
        except Exception as e:
            print_msg('[_enqueue.exception] %s' % str(e))
            return False

    #
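    # Fetch a path (first ~200 KB via the Range header), enqueue links that stay
    # within max_depth, and record any white-list text hits on the index page.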
    def crawl(self, path):
        try:
            headers = dict(
                HEADERS,
                Range='bytes=0-204800')  # allowed size increased to 200 kb
            status, headers, html_doc = self.http_request(path,
                                                          headers=headers)
            if path == '/':
                self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
            if self.index_html_doc:
                soup = BeautifulSoup(html_doc, "html.parser")
                for link in soup.find_all('a'):
                    url = link.get('href', '').strip()
                    if url.startswith('..'):
                        continue
                    if not url.startswith('/') and url.find('//') < 0:
                        url = path + url
                    url, depth = cal_depth(self, url)
                    # print url, depth
                    if depth <= self.max_depth:
                        self.enqueue(url)
                ret = self.find_text(html_doc)
                if ret:
                    if '/' not in self.results:
                        self.results['/'] = []
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    _ = {
                        'status': status,
                        'url': '%s%s' % (self.base_url, path),
                        'title': title,
                        'vul_type': ret[1]
                    }
                    if _ not in self.results['/']:
                        self.results['/'].append(_)

        except Exception as e:
            print_msg('[crawl Exception] %s %s' % (path, str(e)))
            traceback.print_exc()

    #
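    # Re-enqueue URLs from a previous scan log whose recorded status was 200, 403 or 302.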
    def load_all_urls_from_log_file(self):
        try:
            with open(self.log_file) as infile:
                for _line in infile.xreadlines():
                    _ = _line.strip().split()
                    if len(_) == 3 and (_[2].find('^^^200') > 0
                                        or _[2].find('^^^403') > 0
                                        or _[2].find('^^^302') > 0):
                        url, depth = cal_depth(self, _[1])
                        self.enqueue(url)
        except Exception as e:
            print_msg('[load_all_urls_from_log_file] %s' % str(e))

    #
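    # Return (True, reason) on the first white-list text/regex hit, else False.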
    def find_text(self, html_doc):
        for _text in self.text_to_find:
            if html_doc.find(_text) >= 0:
                return True, 'Found [%s]' % _text
        for _regex in self.regex_to_find:
            if _regex.search(html_doc):
                return True, 'Found Regex [%s]' % _regex.pattern
        return False

    #
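    # True if the body matches any black-list text/regex.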
    def find_exclude_text(self, html_doc):
        for _text in self.text_to_exclude:
            if html_doc.find(_text) >= 0:
                return True
        for _regex in self.regex_to_exclude:
            if _regex.search(html_doc):
                return True
        return False

    #
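    # Worker loop: pop items from url_queue until it is empty or the per-target
    # timeout expires; 2-tuples are script checks, 7-tuples are rule checks.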
    def scan_worker(self):
        while self.url_queue.qsize() > 0:
            if time.time() - self.start_time > self.time_out:
                self.url_queue.queue.clear()
                print_msg('[ERROR] Timed out task: %s' % self.base_url)
                return
            try:
                item = self.url_queue.get(timeout=0.1)
            except Exception as e:
                return
            try:
                if len(item) == 2:  # Script Scan
                    check_func = getattr(item[0], 'do_check')
                    # print_msg('Begin %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    check_func(self, item[1])
                    # print_msg('End %s %s' % (os.path.basename(item[0].__file__), item[1]))
                    continue
                else:
                    url_description, tag, status_to_match, content_type, content_type_no, root_only, vul_type = item
                    prefix = url_description['prefix']
                    url = url_description['full_url']

                    if url.find('{sub}') >= 0:
                        if not self.domain_sub:
                            continue
                        url = url.replace('{sub}', self.domain_sub)

            except Exception as e:
                print_msg('[scan_worker.1] %s' % str(e))
                traceback.print_exc()
                continue
            if not item or not url:
                break

            # print_msg('[%s]' % url.strip())
            try:
                status, headers, html_doc = self.http_request(url)
                cur_content_type = headers.get('content-type', '')
                cur_content_length = headers.get('content-length',
                                                 len(html_doc))

                if self.find_exclude_text(html_doc):  # excluded text found
                    continue

                if 0 <= int(cur_content_length) <= 10:  # text too short
                    continue

                if cur_content_type.find('image/') >= 0:  # exclude image
                    continue

                if content_type != 'application/json' and cur_content_type.find('application/json') >= 0 and \
                        not url.endswith('.json'):    # invalid json
                    continue

                if content_type and cur_content_type.find(content_type) < 0 \
                        or content_type_no and cur_content_type.find(content_type_no) >= 0:
                    continue  # content type mismatch

                if tag and html_doc.find(tag) < 0:
                    continue  # tag mismatch

                if self.find_text(html_doc):
                    valid_item = True
                else:
                    # status code check
                    if status_to_match == 206 and status != 206:
                        continue
                    if status_to_match in (200, 206) and status in (200, 206):
                        valid_item = True
                    elif status_to_match and status != status_to_match:
                        continue
                    elif status in (403, 404) and status != status_to_match:
                        continue
                    else:
                        valid_item = True

                    if status == self._404_status and url != '/':
                        len_doc = len(html_doc)
                        len_sum = self.len_404_doc + len_doc
                        if len_sum == 0 or (0.4 <= float(len_doc) / len_sum <=
                                            0.6):
                            continue

                if valid_item:
                    m = re.search('<title>(.*?)</title>', html_doc)
                    title = m.group(1) if m else ''
                    with self.lock:  # guard the shared results dict across worker threads
                        # print '[+] [Prefix:%s] [%s] %s' % (prefix, status, 'http://' + self.host +  url)
                        if prefix not in self.results:
                            self.results[prefix] = []
                        _ = {
                            'status': status,
                            'url': '%s%s' % (self.base_url, url),
                            'title': title,
                            'vul_type': vul_type
                        }
                        if _ not in self.results[prefix]:
                            self.results[prefix].append(_)
            except Exception as e:
                print_msg('[scan_worker.2][%s] %s' % (url, str(e)))
                traceback.print_exc()

    #
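    # Start the worker threads, then keep at most one result per folder that
    # yielded more than 5 hits.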
    def scan(self, threads=6):
        try:
            all_threads = []
            for i in range(threads):
                t = threading.Thread(target=self.scan_worker)
                t.start()
                all_threads.append(t)
            for t in all_threads:
                t.join()

            for key in self.results.keys():
                if len(
                        self.results[key]
                ) > 5:  # Over 5 URLs found under this folder, show first only
                    self.results[key] = self.results[key][:1]
            return self.host, self.results
        except Exception as e:
            print_msg('[scan exception] %s' % str(e))
        finally:
            if self.conn_pool:
                self.conn_pool.close()
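
The 404 filter near the end of scan_worker boils down to a length-ratio heuristic for dropping pages that are really just the site's custom "not found" page. A minimal standalone sketch of that check (the function and parameter names below are illustrative, not part of BBScan):

def looks_like_custom_404(len_404_doc, len_doc):
    # Mirrors the ratio test in scan_worker: when the candidate body and the
    # recorded 404 body are of comparable size (each 40-60% of their combined
    # length), treat the response as another rendering of the custom 404 page.
    len_sum = len_404_doc + len_doc
    if len_sum == 0:
        return True
    return 0.4 <= float(len_doc) / len_sum <= 0.6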