Python Spider.startCrawl примеры использования

Язык программирования: Python

Пространство имен/Пакет: spider

Класс/Тип: Spider

Метод/Функция: startCrawl

Примеров на hotexamples.com: 1

Python Spider.startCrawl - 1 пример найден. Это лучшие примеры Python кода для spider.Spider.startCrawl, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать / Скрыть

Spider(30)

crawl_page(30)

crawl(14)

__init__(8)

craw(4)

Search(4)

crawl_genre(3)

build_node(3)

analyse(3)

process_page(2)

court(2)

add_url(2)

content_list(2)

GetInfo(2)

crowl(1)

crowl_page(1)

GET(1)

crawled_page(1)

createResultExcel(1)

get2l_url(1)

crawledPage(1)

crawle_page_in_queue(1)

crawl_weather(1)

crawl_video_urls(1)

crawl_robots(1)

data(1)

getfilename(1)

get3l_url(1)

post(1)

update(1)

startCrawl(1)

setworkdir(1)

setfilename(1)

setDaemon(1)

responseCallback(1)

parse_blog(1)

getSoup(1)

linkCallback(1)

levelCallback(1)

is_valid(1)

is_outgoing(1)

htmlCallback(1)

get_pdfs(1)

crawl_page_graph(1)

crawl_async_slots(1)

crawl_next_page_from_queue(1)

authorized(1)

Process(1)

ReturnValues(1)

Text(1)

Пример #1

Показать файл

Файл: main.py · Проект: hybridtheorylink/spider

from spider import Spider
import _thread
import imageUtils
import strUtils
from lxml import etree
import webUtils

def downloadImg(url,nodes):
    """Start a background download for each usable ``<img src>`` in ``nodes``.

    Args:
        url: Address of the page being processed; passed to
            ``webUtils.adjustUrl`` together with each image source.
        nodes: Object exposing ``xpath``; queried with ``'//img/@src'``.
            Presumably an lxml element tree -- confirm against the caller.

    Returns:
        None. An exception raised while starting a thread is printed and
        the loop moves on to the next image.
    """
    for src in nodes.xpath('//img/@src'):
        resolved = webUtils.adjustUrl(url, src)
        if resolved is not None:
            # NOTE(review): the thread receives the raw ``src`` value, not
            # ``resolved``; verify whether the adjusted URL was intended.
            try:
                _thread.start_new_thread(imageUtils.downloadImage, (src,))
            except Exception as err:
                print(err)

# Crawl configuration handed to Spider below.
# NOTE(review): the meaning of each option is defined by spider.Spider, which
# is not visible here; ``allows`` looks like a list of regex patterns and
# ``netlocs`` like a host filter -- confirm against the Spider implementation.
startUrsl = [
    'https://movie.douban.com/subject/3569910/',
]
netlocs = 'douban.com'
allows = [
    '.*douban.*',
]

# downloadImg is registered as the spider's callback.
spider1 = Spider(
    startUrls=startUrsl,
    netlocs=netlocs,
    allows=allows,
    callback=downloadImg,
)

# Runs at import time: the crawl starts as soon as this module is executed.
spider1.startCrawl()