import csv
import json
import re
import string
import threading
import time
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile

import lxml.html

# The helpers below come from the accompanying chapter code. The module
# names are assumptions -- adjust them to match the project layout.
from downloader import Downloader
from mongo_cache import MongoCache
from mongo_queue import MongoQueue
from link_crawler import normalize, same_domain, get_robots, get_links

SLEEP_TIME = 1  # seconds between thread-pool maintenance passes


def get_dynamic():
    """Download a JavaScript-driven page and print its #results element."""
    url = 'http://example.webscraping.com/dynamic'
    D = Downloader()
    content = D(url)
    #print(content.decode('utf-8'))
    tree = lxml.html.fromstring(content)
    print(tree.cssselect('#results')[0].text_content())


def search1():
    """Scrape country names letter by letter via the AJAX search API."""
    template_url = ('http://example.webscraping.com/ajax/'
                    'search.json?page={}&page_size=10&search_term={}')
    countries = set()
    download = Downloader(cache=MongoCache())
    for letter in string.ascii_lowercase:
        page = 0
        while True:
            url = template_url.format(page, letter)
            print('URL:', url)
            response = download(url)
            try:
                ajax = json.loads(response)
            except ValueError as e:
                print(e)
                ajax = None
            else:
                for record in ajax['records']:
                    countries.add(record['country'])
            page += 1
            if ajax is None or page >= ajax['num_pages']:
                break
    with open('countries.txt', 'w') as fp:
        fp.write('\n'.join(sorted(countries)))


def search2():
    """Fetch every country in a single oversized AJAX request.

    Setting page_size=1000 with the match-anything search term '.'
    returns all records in one response.
    """
    D = Downloader()
    response = D('http://example.webscraping.com/places/default/'
                 'search?page=0&page_size=1000&search_term=.')
    print(response.decode('utf-8'))
    ajax = json.loads(response)
    with open('countries.csv', 'w', newline='') as fp:
        writer = csv.writer(fp)
        for record in ajax['records']:
            writer.writerow([record['country']])


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads, with a MongoDB-backed crawl queue."""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            # fetch the next URL, if any
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no URLs to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit on Ctrl-C
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)


def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl this website in multiple threads.

    In-memory variant: uses a plain list and a seen set instead of the
    MongoDB-backed queue above (note that it shadows that definition when
    both are kept in the same module).
    """
    print(seed_url)
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen
    seen = set([seed_url])
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except IndexError:
                # crawl queue is empty
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            link = normalize(seed_url, link)
                            # check whether this link has already been crawled
                            if link not in seen:
                                seen.add(link)
                                # add this new link to the queue
                                crawl_queue.append(link)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # the crawl is still active
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so the main thread can exit on Ctrl-C
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # sleep briefly so the CPU can focus on the worker threads
        time.sleep(SLEEP_TIME)


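# A minimal usage sketch for the crawler above. This entry point is not
# part of the original file; the callback and seed URL are illustrative.
def run_threaded_crawler_example():
    def report(url, html):
        # toy callback: report what was downloaded, enqueue no extra links
        print('Downloaded {} ({} bytes)'.format(url, len(html or b'')))
        return []
    threaded_crawler('http://example.webscraping.com',
                     scrape_callback=report, max_threads=5)

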
def alexa():
    """Download the Alexa top 1 million list and return its URLs."""
    D = Downloader()
    zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    print(type(zipped_data))
    urls = []  # the top 1 million URLs will be stored in this list
    with ZipFile(BytesIO(zipped_data)) as zf:
        csv_filename = zf.namelist()[0]
        print(csv_filename)
        mess = zf.open(csv_filename)
        for website in mess.readlines():
            # each line looks like b'1,google.com\n'; keep just the domain
            domain = website.decode('utf-8').strip().split(',')[1]
            print(domain)
            urls.append('http://' + domain)
    return urls


def alexa2():
    """As alexa(), but parse the ranking file with csv.reader."""
    D = Downloader()
    zipped_data = D('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip')
    print(type(zipped_data))
    urls = []  # the top 1 million URLs will be stored in this list
    with ZipFile(BytesIO(zipped_data)) as zf:
        csv_filename = zf.namelist()[0]
        print(csv_filename)
        # ZipFile.open returns a binary stream; wrap it so csv.reader gets text
        mess = TextIOWrapper(zf.open(csv_filename, 'r'))
        for rank, website in csv.reader(mess):
            print(rank, website)
            urls.append('http://' + website)
    return urls


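# Usage sketch (hypothetical, not from the original file): seed downloads
# with the first few Alexa domains. Only a handful are taken to keep the
# example quick; Downloader handles throttling and caching as elsewhere.
def crawl_top_sites(limit=5):
    D = Downloader()
    for url in alexa2()[:limit]:
        D(url)  # download each homepage

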
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1,
                 max_urls=-1, user_agent='wswp', proxies=None, num_retries=1,
                 scrape_callback=None, cache=None, ignore_robots=False):
    """Crawl from the given seed URL following links matched by link_regex."""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies,
                   num_retries=num_retries, cache=cache)

    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        # check that the URL passes the robots.txt restrictions
        if ignore_robots or rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check that the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)
            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)


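# Usage sketch for link_crawler (not from the original file; the regex is
# a hypothetical pattern for the demo site's index and view pages).
def run_link_crawler_example():
    link_crawler('http://example.webscraping.com',
                 link_regex='/(index|view)', max_depth=2)

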
def direct_download_ajax():
    """Download the AJAX endpoint directly and decode its JSON."""
    D = Downloader()
    response = D('http://example.webscraping.com/ajax/')
    content = json.loads(response.decode('utf-8'))
    print(content)


def fail_search():
    """Demonstrate why scraping the search page directly fails.

    The results div is populated by JavaScript after the page loads, so
    the raw HTML contains no result links for cssselect to match.
    """
    D = Downloader()
    response = D('http://example.webscraping.com/search')
    tree = lxml.html.fromstring(response)
    # prints an empty list: the AJAX results are not in the static HTML
    print(tree.cssselect('div#results a'))
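

# The fix demonstrated elsewhere in this file: query the AJAX endpoint the
# search page itself calls, instead of scraping the static HTML. The URL
# pattern and JSON keys are taken from search1() above.
def working_search(term='a'):
    D = Downloader()
    response = D('http://example.webscraping.com/ajax/'
                 'search.json?page=0&page_size=10&search_term={}'.format(term))
    return [record['country'] for record in json.loads(response)['records']]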