def parse_svn_entries(url):
    description_file = 'SVN entries file'
    description_dir = 'SVN entries directory'
    target_url = url + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs,
        limit_len=False, add_headers=base_headers)

    if response_code in conf.expected_file_responses and content:
        tokens = content.decode().split('\n')
        if 'dir' in tokens:
            for pos, token in enumerate(tokens):
                if token == 'dir':
                    # Fetch more entries recursively
                    if tokens[pos - 1] != '':
                        textutils.output_debug(' - Svn Plugin: Found dir: ' + url + '/' + tokens[pos - 1])

                        if conf.allow_download:
                            textutils.output_info(' - Svn Plugin: Downloading: ' + url + '/' + tokens[pos - 1] + '\r')
                        else:
                            textutils.output_found(description_dir + ' at: ' + url + '/' + tokens[pos - 1])

                        # Parse the next directory level
                        parse_svn_entries(url + "/" + tokens[pos - 1])

                elif token == 'file':
                    textutils.output_debug(' - Svn Plugin: Found file: ' + url + '/' + tokens[pos - 1])
                    if conf.allow_download:
                        textutils.output_info(' - Svn Plugin: Downloading: ' + url + '/' + tokens[pos - 1] + '\r')
                        # Fetch the pristine copy from .svn/text-base
                        path = url + "/.svn/text-base" + '/' + tokens[pos - 1] + ".svn-base"
                        fetcher = Fetcher()
                        response_code, content, headers = fetcher.fetch_url(
                            path, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)
                        save_file(url + '/' + tokens[pos - 1], content)
                    else:
                        textutils.output_found(description_file + ' at: ' + url + '/' + tokens[pos - 1])
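# The legacy (SVN <= 1.6) .svn/entries format is line-oriented: each entry's
# name sits on the line immediately before its kind token ('dir' or 'file'),
# which is the adjacency parse_svn_entries() exploits. A minimal sketch of the
# same token walk on a synthetic blob -- the sample content is illustrative,
# not a byte-exact dump of a real working copy:
def _demo_entry_scan():
    sample = b"10\n\ndir\n...\ntrunk\ndir\n...\nREADME.md\nfile\n"
    tokens = sample.decode().split('\n')
    for pos, token in enumerate(tokens):
        # The root entry has an empty name, matching the guard above
        if token in ('dir', 'file') and tokens[pos - 1] not in ('', '...'):
            print(token, '->', tokens[pos - 1])  # dir -> trunk, file -> README.md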
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    textutils.output_info(' - Svn Plugin: Searching for /.svn/entries')
    target_url = conf.target_base_path + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    svn_legacy = True

    if response_code in conf.expected_file_responses and content:
        if conf.allow_download:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (will download files to output/)')
        else:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (use -a to download files instead of printing)')

        # Test for SVN 1.7+, which stores working-copy metadata in a
        # SQLite database (.svn/wc.db) instead of the entries file
        target_url = conf.target_base_path + "/.svn/wc.db"
        fetcher = Fetcher()
        response_code, content, headers = fetcher.fetch_url(
            target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

        if response_code in conf.expected_file_responses and content:
            textutils.output_info(' - Svn Plugin: SVN 1.7+ detected, parsing wc.db')
            svn_legacy = False
            save_file(conf.target_base_path + '/wc.db', content)

        # Process the index with the matching parser
        if svn_legacy:
            parse_svn_entries(conf.target_base_path)
        else:
            parse_svn_17_db(conf.target_base_path + '/wc.db')

        # Clean up display
        if conf.allow_download:
            textutils.output_info('')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
def __init__(self, thread_id, output=True):
    Thread.__init__(self)
    self.kill_received = False
    self.thread_id = thread_id
    self.fetcher = Fetcher()  # per-thread fetcher instance
    self.output = output
    reset_behavior_database()
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    textutils.output_info(' - Svn Plugin: Searching for /.svn/entries')
    target_url = conf.target_base_path + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    # Compare status codes with ==, not 'is': 'is' tests object identity
    # and only appears to work for small ints by CPython accident
    if response_code == 200 or response_code == 302:
        if conf.allow_download:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (will download files to output/)')
        else:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (use -a to download files instead of printing)')

        # parse entries
        parse_svn_entries(conf.target_base_path)

        # Clean up display
        if conf.allow_download:
            textutils.output_info('')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
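# Two pitfalls recur in the response checks below and are fixed in place.
# First, 'is' compares object identity, not value; it happens to hold for
# small cached ints in CPython but is not a value comparison. Second, 'and'
# binds tighter than 'or', so 'code == 200 or code == 302 and content'
# parses as '(code == 200) or ((code == 302) and content)' -- the content
# guard never applies to the 200 case. Shown in isolation:
code, content = 200, b''
print(code == 200 or code == 302 and bool(content))  # True: empty content slips through
print(code in (200, 302) and bool(content))          # False: guard applies to both codes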
def execute():
    """ Fetch sitemap.xml and add each entry as a target """
    current_template = dict(conf.path_template)
    current_template['description'] = 'sitemap.xml entry'

    target_url = urljoin(conf.target_base_path, "/sitemap.xml")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs,
        limit_len=False, add_headers={})

    if not isinstance(content, str):
        content = content.decode('utf-8', 'ignore')

    if response_code in (200, 302) and content:
        regexp = re.compile(r'(?im).*<url>\s*<loc>(.*)</loc>\s*</url>.*')
        matches = re.findall(regexp, content)

        textutils.output_debug("SitemapXML plugin")

        added = 0
        for match in matches:
            if not isinstance(match, str):
                match = match.decode('utf-8', 'ignore')
            parsed = urlparse(match)
            if parsed.path:
                new_path = parsed.path
            else:
                continue

            # Remove trailing /
            if new_path.endswith('/'):
                new_path = new_path[:-1]

            if add_path(new_path):
                added += 1
                textutils.output_debug(" - Added: %s from /sitemap.xml" % new_path)

        if added > 0:
            textutils.output_info(' - SitemapXML Plugin: added %d base paths '
                                  'using /sitemap.xml' % added)
        else:
            textutils.output_info(' - SitemapXML Plugin: no usable entries '
                                  'in /sitemap.xml')
    else:
        textutils.output_info(' - SitemapXML Plugin: /sitemap.xml not found on '
                              'target site')
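# Quick check of the <loc> extraction used above against a minimal sitemap
# body (the sample XML is illustrative; real sitemaps carry a namespace the
# regex deliberately ignores):
import re
from urllib.parse import urlparse

sample = """<urlset>
  <url> <loc>https://example.com/blog/</loc> </url>
  <url> <loc>https://example.com/about</loc> </url>
</urlset>"""
for loc in re.findall(r'(?im).*<url>\s*<loc>(.*)</loc>\s*</url>.*', sample):
    print(urlparse(loc).path.rstrip('/'))  # /blog, /about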
def execute():
    """ Fetch /robots.txt and add the disallowed paths as target """
    current_template = dict(conf.path_template)
    current_template['description'] = 'Robots.txt entry'

    target_url = urljoin(conf.target_base_path, "/robots.txt")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    # Normalize to str once; the regex below operates on text
    if not isinstance(content, str):
        content = content.decode('utf-8', 'ignore')

    if response_code in (200, 302) and content:
        matches = re.findall(r'Disallow:\s*/[a-zA-Z0-9-/\r]+\n', content)

        textutils.output_debug(content)

        added = 0
        for match in matches:
            # Filter out wildcard and whitespace characters. Note that in
            # Python 3 filter() returns a lazy iterator, so the characters
            # must be joined back into a string before use.
            match = ''.join(c for c in match if c not in ' *?.\n\r\t')
            if match:
                # Split on ':' to isolate the path from the directive
                splitted = match.split(':')
                if splitted[1]:
                    target_path = splitted[1]
                    textutils.output_debug(target_path)

                    # Remove trailing /
                    if target_path.endswith('/'):
                        target_path = target_path[:-1]

                    current_template = current_template.copy()
                    current_template['url'] = target_path
                    database.paths.append(current_template)
                    textutils.output_debug(' - Robots Plugin Added: ' + str(target_path) + ' from robots.txt')
                    added += 1

        if added > 0:
            textutils.output_info(' - Robots Plugin: added ' + str(added) + ' base paths using /robots.txt')
        else:
            textutils.output_info(' - Robots Plugin: no usable entries in /robots.txt')
    else:
        textutils.output_info(' - Robots Plugin: /robots.txt not found on target site')
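# The Disallow extraction above, run end-to-end on a sample robots.txt body
# (sample content is illustrative):
import re

sample = "User-agent: *\nDisallow: /admin/\nDisallow: /tmp-files/\n"
for match in re.findall(r'Disallow:\s*/[a-zA-Z0-9-/\r]+\n', sample):
    cleaned = ''.join(c for c in match if c not in ' *?.\n\r\t')
    print(cleaned.split(':')[1].rstrip('/'))  # /admin, /tmp-files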
def get_session_cookies():
    """ Fetch initial session cookies """
    textutils.output_info('Fetching session cookie')
    path = conf.path_template.copy()
    path['url'] = '/'

    # We're not using the fetch cache for session cookie sampling
    fetcher = Fetcher()

    code, content, headers = fetcher.fetch_url('/', conf.user_agent, 10)
    if code == 200:
        cookies = headers.get('Set-Cookie')
        if cookies:
            database.session_cookie = cookies
class Scheduler(object):
    test = Tester()
    fetch = Fetcher()
    redis = RedisDataBase()

    def _test(self, queue):
        while True:
            if not self.redis.is_empty:
                print(getTime('Proxy tester module starting'))
                self.test.run()
            else:
                print(getTime('Proxy pool exhausted, tester module going to sleep'))
                queue.put(True)  # signal the fetcher to run immediately
                delay(60 * 60)

    def _fetch(self, queue, flag=False):
        while True:
            if not queue.empty():
                flag = queue.get()
            if flag:
                flag = False
                print(getTime('Proxy pool exhausted, forcing the fetcher module to start'))
                self.fetch.run()
            if times() in [6, 18]:  # scheduled runs at 06:00 and 18:00
                print(getTime('Proxy fetcher module starting'))
                self.fetch.run()
                print(getTime('Fetch complete, fetcher module going to sleep'))

    def _app(self):
        app.run(host='0.0.0.0')

    def all_run(self):
        print(getTime('Proxy pool starting......'))
        queue = Queue()  # shared signal channel between tester and fetcher
        tester = Process(target=self._test, args=(queue,))
        tester.start()
        fetcher = Process(target=self._fetch, args=(queue,))
        fetcher.start()
        web = Process(target=self._app)  # renamed from 'app' to avoid shadowing the Flask app
        web.start()
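# A minimal entry point for the scheduler -- a sketch assuming the
# surrounding module provides Tester, Fetcher, RedisDataBase, the Flask
# 'app', and the getTime/delay/times helpers. The __main__ guard matters
# because multiprocessing re-imports the module under spawn start methods:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    scheduler = Scheduler()
    scheduler.all_run()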
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    current_template = dict(conf.path_template)
    current_template['description'] = '/.svn/entries found directory'

    target_url = urljoin(conf.target_base_path, "/.svn/entries")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    if response_code in (200, 302) and content:
        added = 0
        try:
            tree = ElementTree.fromstring(content)
            entry_tags = tree.iter()
            if entry_tags:
                for entry in entry_tags:
                    kind = entry.attrib.get("kind")
                    if kind and kind == "dir":
                        current_template = current_template.copy()
                        current_template['url'] = '/' + entry.attrib["name"]
                        database.paths.append(current_template)
                        added += 1
        except Exception:
            textutils.output_info(' - Svn Plugin: no usable entries in /.svn/entries')
        else:
            if added > 0:
                textutils.output_info(' - Svn Plugin: added ' + str(added) + ' base paths using /.svn/entries')
            else:
                textutils.output_info(' - Svn Plugin: no usable entries in /.svn/entries')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
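# Early SVN working copies (roughly pre-1.4) stored .svn/entries as XML,
# which is the variant this parser targets. A self-contained sketch of the
# kind="dir" extraction against a synthetic document (the sample XML is
# illustrative, not a verbatim working-copy dump):
from xml.etree import ElementTree

sample = b'''<wc-entries>
  <entry name="" kind="dir"/>
  <entry name="admin" kind="dir"/>
  <entry name="index.php" kind="file"/>
</wc-entries>'''
for entry in ElementTree.fromstring(sample).iter():
    if entry.attrib.get("kind") == "dir" and entry.attrib.get("name"):
        print('/' + entry.attrib["name"])  # /admin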
def __init__(self, thread_id, output=True):
    Thread.__init__(self)
    self.kill_received = False
    self.thread_id = thread_id
    self.fetcher = Fetcher()  # per-thread fetcher instance
    self.output = output