import random
import re

import requests

# The helpers below come from the surrounding project; the module paths are
# assumed here and may need adjusting to the actual package layout: `run` and
# `good` are console prefix strings, `verb` is the verbose logger, `xml_parser`
# extracts URLs from sitemap XML, and `time_machine` queries archive.org.
from core.colors import good, run
from core.utils import verb, xml_parser
from plugins.wayback import time_machine


def zap(input_url, archive, domain, host, internal, robots, proxies):
    """Extract links from robots.txt and sitemap.xml."""
    if archive:
        print('%s Fetching URLs from archive.org' % run)
        # The domain-wide lookup is currently disabled; archive.org is always
        # queried by host.
        if False:
            archived_urls = time_machine(domain, 'domain')
        else:
            archived_urls = time_machine(host, 'host')
        print('%s Retrieved %i URLs from archive.org' % (
            good, len(archived_urls) - 1))
        for url in archived_urls:
            verb('Internal page', url)
            internal.add(url)
    # Makes a request to robots.txt
    response = requests.get(input_url + '/robots.txt',
                            proxies=random.choice(proxies),
                            verify=False).text
    # Making sure robots.txt isn't some fancy 404 page
    if '<body' not in response:
        matches = re.findall(r'Allow: (.*)|Disallow: (.*)', response)
        if matches:
            # Iterating over the matches; each match is a tuple here
            for match in matches:
                # One item in the tuple will always be empty, so combine both
                match = ''.join(match)
                # Skip rules that use a wildcard
                if '*' not in match:
                    url = input_url + match
                    # Add the URL to the internal list for crawling
                    internal.add(url)
                    # Add the URL to the robots list
                    robots.add(url)
            print('%s URLs retrieved from robots.txt: %s' % (
                good, len(robots)))
    # Makes a request to sitemap.xml
    response = requests.get(input_url + '/sitemap.xml',
                            proxies=random.choice(proxies),
                            verify=False).text
    # Making sure sitemap.xml isn't some fancy 404 page
    if '<body' not in response:
        matches = xml_parser(response)
        if matches:
            print('%s URLs retrieved from sitemap.xml: %s' % (
                good, len(matches)))
            for match in matches:
                verb('Internal page', match)
                # Add the URL to the internal list for crawling
                internal.add(match)
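
# --- Usage sketch (illustrative, not part of the module) ---------------------
# A minimal example of how zap() might be driven, assuming the project helpers
# imported above are available. The URL and proxy list are hypothetical;
# `internal` and `robots` must be sets because zap() calls .add() on them, and
# `proxies` must be a non-empty list of requests-style proxy mappings (or None
# entries) for random.choice().
if __name__ == '__main__':
    internal_links = set()
    robots_links = set()
    zap(
        input_url='http://example.com',
        archive=False,           # skip the archive.org lookup in this sketch
        domain='example.com',
        host='example.com',
        internal=internal_links,
        robots=robots_links,
        proxies=[None],          # a None entry means "no proxy" for requests
    )
    print('Crawl seeds collected:', len(internal_links))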