Example #1
import logging

from twisted.internet import reactor, defer

from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
from test.framework.spider import Test_Spider_2
from test.framework.spider.test_spider.test_Spider_01 import Test_Spider_1
from test.framework.spider.test_spider.test_Spider_03 import Test_Spider_3

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)


def finish_crawl(content):
    # Fires once the DeferredList below has collected all three crawl results.
    logging.info("finish")
    return content


settings = Setting()

# One Crawler per spider class, all sharing the same settings.
crawler_01 = Crawler(Test_Spider_1, settings)
crawler_02 = Crawler(Test_Spider_2, settings)
crawler_03 = Crawler(Test_Spider_3, settings)
spider_01 = crawler_01._create_spider()
spider_02 = crawler_02._create_spider()
spider_03 = crawler_03._create_spider()

# crawl() returns a Deferred for each spider; the three crawls run concurrently.
c1 = crawler_01.crawl()
c2 = crawler_02.crawl()
c3 = crawler_03.crawl()

# Fire finish_crawl once all three Deferreds have fired, then stop the reactor.
dd = defer.DeferredList([c1, c2, c3])
dd.addCallback(finish_crawl)
dd.addBoth(lambda _: reactor.stop())
reactor.run()
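
For comparison, the crawls can also run strictly one after another by chaining the same Deferreds with Twisted's defer.inlineCallbacks. A minimal sketch, assuming fresh Crawler instances built exactly as above:

@defer.inlineCallbacks
def crawl_sequentially(crawlers):
    # Yielding each Deferred pauses here until that crawl finishes,
    # so the spiders run one at a time instead of concurrently.
    for crawler in crawlers:
        crawler._create_spider()
        yield crawler.crawl()

d = crawl_sequentially([Crawler(s, Setting())
                        for s in (Test_Spider_1, Test_Spider_2, Test_Spider_3)])
d.addBoth(lambda _: reactor.stop())
reactor.run()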
Example #2
from twisted.internet import reactor

from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
# The import paths for Request, Downloader, and the LJSpider test spider are
# assumed here; adjust them to the framework's actual module layout.
from test.framework.http.request import Request
from test.framework.core.downloader import Downloader
from test.framework.spider.test_spider.test_LJSpider import LJSpider


def request_callback(content):
    # Assumed: the original excerpt references request_callback without defining it.
    print("request_and_response callback")
    print(content)
    return content


def request_errback(content):
    # An errback receives a twisted Failure describing the error.
    print("request_and_response errback")
    print(content)
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


# Placeholder target; substitute a real URL and request headers.
url = "http://example.com/"
headers = {}

request = Request(url=url, callback=request_callback, method='GET',
                  headers=headers, errback=request_errback,
                  meta={"download_timeout": 2})

settings = Setting()
crawler = Crawler(LJSpider, settings)
spider = crawler._create_spider()
downloader = Downloader(crawler)

"""
httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request,spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
"""
# fetch() returns a Deferred that fires with the downloaded response.
agent = downloader.fetch(request, spider)
agent.addCallback(request_callback)
agent.addBoth(lambda _: reactor.stop())

reactor.run()
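
For context, a plain-Twisted sketch of what such a fetch boils down to, using twisted.web.client.Agent directly; the URL is a hypothetical placeholder and the 2-second limit is applied as a connect timeout:

from twisted.internet import reactor
from twisted.web.client import Agent, readBody
from twisted.web.http_headers import Headers

agent = Agent(reactor, connectTimeout=2)
d = agent.request(b'GET', b'http://example.com/',
                  Headers({b'User-Agent': [b'test-crawler']}), None)
d.addCallback(readBody)                       # Response -> Deferred of body bytes
d.addCallback(lambda body: print(body[:200]))
d.addErrback(lambda f: print("request failed:", f.value))
d.addBoth(lambda _: reactor.stop())
reactor.run()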

Example #3

import logging

import pymongo
from twisted.internet import reactor

from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
# The import path for the Part_Zone spider is assumed.
from test.framework.spider.test_spider.test_part_zone import Part_Zone

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)

# Address and port of the MongoDB server.
mongo_url = "127.0.0.1:27017"

# Connect to MongoDB; with no argument this defaults to "localhost:27017".
client = pymongo.MongoClient(mongo_url)

# Connect to the LianJia database.
DATABASE = "LianJia"
db = client[DATABASE]

# Connect to the collection (table) LianJia.XiaoQu.
COLLECTION = "XiaoQu"
db_coll = db[COLLECTION]

projectionFields = {'_id': False}  # fields to project, given as a dict
queryArgs = {"total_zone_name": "pudong"}

# Query the collection and take the first matching document.
searchRes = db_coll.find(queryArgs, projectionFields)
scheduler = searchRes.next()

settings = Setting()
crawler_01 = Crawler(Part_Zone, settings)
crawler_01._create_spider()

# Run the single crawl, then stop the reactor when it finishes.
c1 = crawler_01.crawl()
c1.addBoth(lambda _: reactor.stop())
reactor.run()
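
The script above only consumes the first matching document; if every matching district should be processed, the cursor can be iterated before the crawl is scheduled. A small sketch against the same collection:

# Iterate every matching document instead of only the first one.
for doc in db_coll.find(queryArgs, projectionFields):
    logging.info("matched zone: %s", doc)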