def parse_svn_entries(url):
    description_file = 'SVN entries file'
    description_dir = 'SVN entries directory'
    target_url = url + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs,
        limit_len=False, add_headers=base_headers)

    if response_code in conf.expected_file_responses and content:
        tokens = content.decode().split('\n')
        if 'dir' in tokens:
            for pos, token in enumerate(tokens):
                if token == 'dir':
                    # Fetch more entries recursively
                    if tokens[pos - 1] != '':
                        textutils.output_debug(' - Svn Plugin: Found dir: ' + url + '/' + tokens[pos - 1])

                        if conf.allow_download:
                            textutils.output_info(' - Svn Plugin: Downloading: ' + url + '/' + tokens[pos - 1] + '\r')
                        else:
                            textutils.output_found(description_dir + ' at: ' + url + '/' + tokens[pos - 1])

                        # Parse the next directory level
                        parse_svn_entries(url + "/" + tokens[pos - 1])

                elif token == 'file':
                    textutils.output_debug(' - Svn Plugin: Found file: ' + url + '/' + tokens[pos - 1])
                    if conf.allow_download:
                        textutils.output_info(' - Svn Plugin: Downloading: ' + url + '/' + tokens[pos - 1] + '\r')
                        # Fetch the pristine copy from .svn/text-base
                        path = url + "/.svn/text-base" + '/' + tokens[pos - 1] + ".svn-base"
                        fetcher = Fetcher()
                        response_code, content, headers = fetcher.fetch_url(
                            path, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)
                        save_file(url + '/' + tokens[pos - 1], content)
                    else:
                        textutils.output_found(description_file + ' at: ' + url + '/' + tokens[pos - 1])
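# The legacy (SVN <= 1.6) .svn/entries format is line-oriented: each entry's
# name sits on the line immediately before its kind token ('dir' or 'file'),
# which is the adjacency parse_svn_entries() exploits. A minimal sketch of the
# same token walk on a synthetic blob -- the sample content is illustrative,
# not a byte-exact dump of a real working copy:
def _demo_entry_scan():
    sample = b"10\n\ndir\n...\ntrunk\ndir\n...\nREADME.md\nfile\n"
    tokens = sample.decode().split('\n')
    for pos, token in enumerate(tokens):
        # The root entry has an empty name, matching the guard above
        if token in ('dir', 'file') and tokens[pos - 1] not in ('', '...'):
            print(token, '->', tokens[pos - 1])  # dir -> trunk, file -> README.md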
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    textutils.output_info(' - Svn Plugin: Searching for /.svn/entries')
    target_url = conf.target_base_path + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    svn_legacy = True

    if response_code in conf.expected_file_responses and content:
        if conf.allow_download:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (will download files to output/)')
        else:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (use -a to download files instead of printing)')

        # Test for SVN 1.7+, which stores working-copy metadata in a
        # SQLite database (.svn/wc.db) instead of the entries file
        target_url = conf.target_base_path + "/.svn/wc.db"
        fetcher = Fetcher()
        response_code, content, headers = fetcher.fetch_url(
            target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

        if response_code in conf.expected_file_responses and content:
            textutils.output_info(' - Svn Plugin: SVN 1.7+ detected, parsing wc.db')
            svn_legacy = False
            save_file(conf.target_base_path + '/wc.db', content)

        # Process the index with the matching parser
        if svn_legacy:
            parse_svn_entries(conf.target_base_path)
        else:
            parse_svn_17_db(conf.target_base_path + '/wc.db')

        # Clean up display
        if conf.allow_download:
            textutils.output_info('')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
def __init__(self, thread_id, output=True):
    Thread.__init__(self)
    self.kill_received = False
    self.thread_id = thread_id
    self.fetcher = Fetcher()  # per-thread fetcher instance
    self.output = output
    reset_behavior_database()
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    textutils.output_info(' - Svn Plugin: Searching for /.svn/entries')
    target_url = conf.target_base_path + "/.svn/entries"
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    # Compare status codes with ==, not 'is': 'is' tests object identity
    # and only appears to work for small ints by CPython accident
    if response_code == 200 or response_code == 302:
        if conf.allow_download:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (will download files to output/)')
        else:
            textutils.output_info(' - Svn Plugin: /.svn/entries found! crawling... (use -a to download files instead of printing)')

        # parse entries
        parse_svn_entries(conf.target_base_path)

        # Clean up display
        if conf.allow_download:
            textutils.output_info('')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
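# Two pitfalls recur in the response checks below and are fixed in place.
# First, 'is' compares object identity, not value; it happens to hold for
# small cached ints in CPython but is not a value comparison. Second, 'and'
# binds tighter than 'or', so 'code == 200 or code == 302 and content'
# parses as '(code == 200) or ((code == 302) and content)' -- the content
# guard never applies to the 200 case. Shown in isolation:
code, content = 200, b''
print(code == 200 or code == 302 and bool(content))  # True: empty content slips through
print(code in (200, 302) and bool(content))          # False: guard applies to both codes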
def execute():
    """ Fetch sitemap.xml and add each entry as a target """
    current_template = dict(conf.path_template)
    current_template['description'] = 'sitemap.xml entry'

    target_url = urljoin(conf.target_base_path, "/sitemap.xml")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs,
        limit_len=False, add_headers={})

    if not isinstance(content, str):
        content = content.decode('utf-8', 'ignore')

    if response_code in (200, 302) and content:
        regexp = re.compile(r'(?im).*<url>\s*<loc>(.*)</loc>\s*</url>.*')
        matches = re.findall(regexp, content)

        textutils.output_debug("SitemapXML plugin")

        added = 0
        for match in matches:
            if not isinstance(match, str):
                match = match.decode('utf-8', 'ignore')
            parsed = urlparse(match)
            if parsed.path:
                new_path = parsed.path
            else:
                continue

            # Remove trailing /
            if new_path.endswith('/'):
                new_path = new_path[:-1]

            if add_path(new_path):
                added += 1
                textutils.output_debug(" - Added: %s from /sitemap.xml" % new_path)

        if added > 0:
            textutils.output_info(' - SitemapXML Plugin: added %d base paths '
                                  'using /sitemap.xml' % added)
        else:
            textutils.output_info(' - SitemapXML Plugin: no usable entries '
                                  'in /sitemap.xml')
    else:
        textutils.output_info(' - SitemapXML Plugin: /sitemap.xml not found on '
                              'target site')
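# Quick check of the <loc> extraction used above against a minimal sitemap
# body (the sample XML is illustrative; real sitemaps carry a namespace the
# regex deliberately ignores):
import re
from urllib.parse import urlparse

sample = """<urlset>
  <url> <loc>https://example.com/blog/</loc> </url>
  <url> <loc>https://example.com/about</loc> </url>
</urlset>"""
for loc in re.findall(r'(?im).*<url>\s*<loc>(.*)</loc>\s*</url>.*', sample):
    print(urlparse(loc).path.rstrip('/'))  # /blog, /about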
def execute():
    """ Fetch /robots.txt and add the disallowed paths as target """
    current_template = dict(conf.path_template)
    current_template['description'] = 'Robots.txt entry'

    target_url = urljoin(conf.target_base_path, "/robots.txt")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    # Normalize to str once; the regex below operates on text
    if not isinstance(content, str):
        content = content.decode('utf-8', 'ignore')

    if response_code in (200, 302) and content:
        matches = re.findall(r'Disallow:\s*/[a-zA-Z0-9-/\r]+\n', content)

        textutils.output_debug(content)

        added = 0
        for match in matches:
            # Filter out wildcard and whitespace characters. Note that in
            # Python 3 filter() returns a lazy iterator, so the characters
            # must be joined back into a string before use.
            match = ''.join(c for c in match if c not in ' *?.\n\r\t')
            if match:
                # Split on ':' to isolate the path from the directive
                splitted = match.split(':')
                if splitted[1]:
                    target_path = splitted[1]
                    textutils.output_debug(target_path)

                    # Remove trailing /
                    if target_path.endswith('/'):
                        target_path = target_path[:-1]

                    current_template = current_template.copy()
                    current_template['url'] = target_path
                    database.paths.append(current_template)
                    textutils.output_debug(' - Robots Plugin Added: ' + str(target_path) + ' from robots.txt')
                    added += 1

        if added > 0:
            textutils.output_info(' - Robots Plugin: added ' + str(added) + ' base paths using /robots.txt')
        else:
            textutils.output_info(' - Robots Plugin: no usable entries in /robots.txt')
    else:
        textutils.output_info(' - Robots Plugin: /robots.txt not found on target site')
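# The Disallow extraction above, run end-to-end on a sample robots.txt body
# (sample content is illustrative):
import re

sample = "User-agent: *\nDisallow: /admin/\nDisallow: /tmp-files/\n"
for match in re.findall(r'Disallow:\s*/[a-zA-Z0-9-/\r]+\n', sample):
    cleaned = ''.join(c for c in match if c not in ' *?.\n\r\t')
    print(cleaned.split(':')[1].rstrip('/'))  # /admin, /tmp-files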
def get_session_cookies():
    """ Fetch initial session cookies """
    textutils.output_info('Fetching session cookie')
    path = conf.path_template.copy()
    path['url'] = '/'

    # We're not using the fetch cache for session cookie sampling
    fetcher = Fetcher()

    code, content, headers = fetcher.fetch_url('/', conf.user_agent, 10)
    if code == 200:
        cookies = headers.get('Set-Cookie')
        if cookies:
            database.session_cookie = cookies
class Scheduler(object):
    test = Tester()
    fetch = Fetcher()
    redis = RedisDataBase()

    def _test(self, queue):
        while True:
            if not self.redis.is_empty:
                print(getTime('Proxy tester module starting'))
                self.test.run()
            else:
                print(getTime('Proxy pool exhausted, tester module going to sleep'))
                queue.put(True)  # signal the fetcher to run immediately
                delay(60 * 60)

    def _fetch(self, queue, flag=False):
        while True:
            if not queue.empty():
                flag = queue.get()
            if flag:
                flag = False
                print(getTime('Proxy pool exhausted, forcing the fetcher module to start'))
                self.fetch.run()
            if times() in [6, 18]:  # scheduled runs at 06:00 and 18:00
                print(getTime('Proxy fetcher module starting'))
                self.fetch.run()
                print(getTime('Fetch complete, fetcher module going to sleep'))

    def _app(self):
        app.run(host='0.0.0.0')

    def all_run(self):
        print(getTime('Proxy pool starting......'))
        queue = Queue()  # shared signal channel between tester and fetcher
        tester = Process(target=self._test, args=(queue,))
        tester.start()
        fetcher = Process(target=self._fetch, args=(queue,))
        fetcher.start()
        web = Process(target=self._app)  # renamed from 'app' to avoid shadowing the Flask app
        web.start()
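# A minimal entry point for the scheduler -- a sketch assuming the
# surrounding module provides Tester, Fetcher, RedisDataBase, the Flask
# 'app', and the getTime/delay/times helpers. The __main__ guard matters
# because multiprocessing re-imports the module under spawn start methods:
from multiprocessing import Process, Queue

if __name__ == '__main__':
    scheduler = Scheduler()
    scheduler.all_run()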
def execute():
    """ Fetch /.svn/entries and parse for target paths """
    current_template = dict(conf.path_template)
    current_template['description'] = '/.svn/entries found directory'

    target_url = urljoin(conf.target_base_path, "/.svn/entries")
    fetcher = Fetcher()
    response_code, content, headers = fetcher.fetch_url(
        target_url, conf.user_agent, conf.fetch_timeout_secs, limit_len=False)

    if response_code in (200, 302) and content:
        added = 0
        try:
            tree = ElementTree.fromstring(content)
            entry_tags = tree.iter()
            if entry_tags:
                for entry in entry_tags:
                    kind = entry.attrib.get("kind")
                    if kind and kind == "dir":
                        current_template = current_template.copy()
                        current_template['url'] = '/' + entry.attrib["name"]
                        database.paths.append(current_template)
                        added += 1
        except Exception:
            textutils.output_info(' - Svn Plugin: no usable entries in /.svn/entries')
        else:
            if added > 0:
                textutils.output_info(' - Svn Plugin: added ' + str(added) + ' base paths using /.svn/entries')
            else:
                textutils.output_info(' - Svn Plugin: no usable entries in /.svn/entries')
    else:
        textutils.output_info(' - Svn Plugin: no /.svn/entries found')
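# Early SVN working copies (roughly pre-1.4) stored .svn/entries as XML,
# which is the variant this parser targets. A self-contained sketch of the
# kind="dir" extraction against a synthetic document (the sample XML is
# illustrative, not a verbatim working-copy dump):
from xml.etree import ElementTree

sample = b'''<wc-entries>
  <entry name="" kind="dir"/>
  <entry name="admin" kind="dir"/>
  <entry name="index.php" kind="file"/>
</wc-entries>'''
for entry in ElementTree.fromstring(sample).iter():
    if entry.attrib.get("kind") == "dir" and entry.attrib.get("name"):
        print('/' + entry.attrib["name"])  # /admin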
def __init__(self, thread_id, output=True):
    Thread.__init__(self)
    self.kill_received = False
    self.thread_id = thread_id
    self.fetcher = Fetcher()  # per-thread fetcher instance
    self.output = output