def test_http_counts_as_internal_link(self):
    self.svc.get('requests')._expect(
        "https://example.com", 200,
        '<a href="http://example.com/insecure">click here</a>')
    self.svc.get('requests')._expect(
        "http://example.com/insecure", 200, '<different><stuff>')
    crawler = Crawler(self.svc, "https://example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "https://example.com": {
            "assets": [], "links": ["http://example.com/insecure"]},
        "http://example.com/insecure": {"assets": [], "links": []},
    }, siteMap)
def test_query_params_are_captured(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<a href="/?foo=bar">click here</a>')
    self.svc.get('requests')._expect(
        "http://example.com/?foo=bar", 200, '<different><stuff>')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "http://example.com": {"assets": [], "links": ["/?foo=bar"]},
        "http://example.com/?foo=bar": {"assets": [], "links": []},
    }, siteMap)
def test_disallowed_urls_are_not_fetched(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://example.com/admin">click here</a>')
    # Mark the admin URL as disallowed in the fake robots.txt parser.
    self.svc.get('RobotFileParser')._disallowed_urls['http://example.com/admin'] = True
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "http://example.com": {
            "assets": [], "links": ["http://example.com/admin"]},
        "http://example.com/admin": {"error": "Disallowed by robots.txt"},
    }, siteMap)
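# A minimal sketch, assuming the crawler consults robots.txt roughly like this
# via the standard library's urllib.robotparser. The 'RobotFileParser' service
# name in the test above suggests the parser is injected through the service
# provider, which is what lets the test swap in a fake with `_disallowed_urls`;
# the names below are illustrative, not the repo's actual implementation.
from urllib.robotparser import RobotFileParser

def is_allowed(root_url, url, user_agent="*"):
    parser = RobotFileParser()
    parser.set_url(root_url.rstrip("/") + "/robots.txt")
    parser.read()  # fetch and parse the site's robots.txt
    return parser.can_fetch(user_agent, url)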
def test_relative_links_are_captured(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<a href="foobar/">click here</a>')
    self.svc.get('requests')._expect("http://example.com/foobar/", 200, '')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "http://example.com": {"assets": [], "links": ["foobar/"]},
        "http://example.com/foobar/": {"assets": [], "links": []},
    }, siteMap)
def test_redirects_to_unknown_protocols_are_handled(self):
    # finalUrl simulates requests following a redirect to a non-HTTP scheme.
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="https://www.example.com/foobar">click here</a>',
        finalUrl="foo:bar")
    crawler = Crawler(self.svc, "example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "http://example.com": {
            "error": "Error fetching url: InvalidSchema('Unrecognized scheme: foo:bar')"},
    }, siteMap)
def test_dont_rerequest_a_redirect(self):
    # The root URL redirects to /foo; the page's link back to the root must
    # resolve via the recorded redirect rather than trigger a second fetch.
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://example.com">click here</a>',
        finalUrl="http://example.com/foo")
    crawler = Crawler(self.svc, "example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "http://example.com/foo": {
            "original_url": "http://example.com",
            "assets": [], "links": ["http://example.com"]},
        "http://example.com": {"redirects_to": "http://example.com/foo"},
    }, siteMap)
def test_redirects_are_handled_nicely(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="https://www.example.com/foobar">click here</a>',
        finalUrl="https://www.example.com")
    self.svc.get('requests')._expect("https://www.example.com/foobar", 200, '')
    crawler = Crawler(self.svc, "example.com")
    siteMap = crawler.map()
    self.assertEqual({
        "https://www.example.com": {
            "original_url": "http://example.com",
            "assets": [], "links": ["https://www.example.com/foobar"]},
        "https://www.example.com/foobar": {"assets": [], "links": []},
        "http://example.com": {"redirects_to": "https://www.example.com"},
    }, siteMap)
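# A minimal sketch (assumed, not necessarily the project's implementation) of
# the redirect bookkeeping the redirect tests above exercise: requests follows
# redirects transparently, so comparing the URL we asked for with response.url
# tells us where we actually landed.
def record_fetch(siteMap, requested_url, response):
    final_url = response.url
    if final_url != requested_url:
        # The requested URL only records where it went...
        siteMap[requested_url] = {"redirects_to": final_url}
        # ...and the destination remembers how we got there, so a later link
        # back to the original URL can be resolved without a re-fetch.
        siteMap[final_url] = {"original_url": requested_url,
                              "assets": [], "links": []}
    else:
        siteMap[requested_url] = {"assets": [], "links": []}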
def main():
    domain = sys.argv[1]
    svc = RealServiceProvider()
    crawler = Crawler(svc, domain)
    print("Mapping...")
    siteMap = crawler.map(verbose=True)
    print("Complete.")
    print()
    print("SiteMap:")
    for url, data in siteMap.items():
        print(url)
        for prop in ["error", "original_url", "redirects_to"]:
            if data.get(prop, False):
                print("  %s: %s" % (prop, data[prop]))
        for prop in ["assets", "links"]:
            print("  %s:" % (prop,))
            for p in data.get(prop, []):
                print("    %s" % (p,))
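# Standard entry point so the crawler can be run from the command line, e.g.:
#   python crawler.py example.com
# (the script name is illustrative; main() takes the domain from sys.argv[1]).
if __name__ == "__main__":
    main()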
def test_subdomain_doesnt_count_as_internal(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://api.example.com">click here</a>')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {
            "assets": [], "links": ["http://api.example.com"]}},
        siteMap)
def test_leaving_out_scheme_in_domain_is_fine(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://example.com">click here</a>')
    crawler = Crawler(self.svc, "example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {
            "assets": [], "links": ["http://example.com"]}},
        siteMap)
def test_recursive_links_dont_cause_re_fetch(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://example.com">click here</a>')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {
            "assets": [], "links": ["http://example.com"]}},
        siteMap)
def test_external_links_are_not_captured(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200,
        '<a href="http://foobar.com">click here</a>')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {
            "assets": [], "links": ["http://foobar.com"]}},
        siteMap)
def test_link_href_is_captured_as_asset(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<link href="/css/foo.css">')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"assets": ["/css/foo.css"], "links": []}},
        siteMap)
def test_image_src_is_captured_as_asset(self):
    self.svc.get('requests')._expect(
        "http://example.com", 200, '<img src="/img/foo.png">')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"assets": ["/img/foo.png"], "links": []}},
        siteMap)
def test_error_urls_are_noted(self):
    self.svc.get('requests')._expect("http://example.com", 400, 'Error: bad request')
    crawler = Crawler(self.svc, "http://example.com")
    siteMap = crawler.map()
    self.assertEqual(
        {"http://example.com": {"error": "Error fetching url. Response code: 400"}},
        siteMap)
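# A minimal sketch of the kind of test double the `_expect` calls above assume:
# each expectation queues a canned, requests-shaped response keyed by URL, with
# `url` standing in for the final URL after redirects. This is inferred from
# the tests, not the repo's actual fake service implementation.
class FakeResponse:
    def __init__(self, status_code, text, url):
        self.status_code = status_code
        self.text = text
        self.url = url  # final URL after any redirects, as requests reports it

class FakeRequests:
    def __init__(self):
        self._expected = {}

    def _expect(self, url, status, body, finalUrl=None):
        # finalUrl defaults to the requested URL, i.e. "no redirect".
        self._expected[url] = FakeResponse(status, body, finalUrl or url)

    def get(self, url):
        return self._expected[url]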