def _load_starturl_from_schedule(self):
    try:
        # Pull the next (name, start_url) task from the schedule without blocking.
        start_urls = self._task_schedule.get(block=False)
        name = start_urls[0]
        start_url = start_urls[1]
        # logger.debug("Currently crawling page: %s" % start_urls)
        logger.info(*self.lfm.crawled(
            "CrawlerRunner", self.name,
            'currently crawling page', start_url)
        )
        crawler = Crawler(self.spidercls, self.settings, self.lfm, self, self.middlewares)
        crawler.create_spider_from_task(name, start_url)
        return crawler
    except Empty:
        logger.debug(*self.lfm.crawled(
            "CrawlerRunner", self.name,
            'all tasks in the queue have been dispatched')
        )
        if not self._push_task_finish:
            self._create_task()
        else:
            self._pull_task_finish = True
    except Exception as e:
        logger.error(*self.lfm.error("CrawlerRunner", self.name,
                                     "", 'error occurred:'),
                     extra={'exception': e},
                     exc_info=True)
    return None
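# A minimal, standalone sketch of the task-schedule pattern used above:
# _task_schedule is assumed to be a standard-library queue.Queue holding
# (name, start_url) tuples; get(block=False) raises queue.Empty once the
# queue is drained, which is what the except Empty branch relies on.
# fill_schedule/drain_schedule are illustrative names, not framework API.
from queue import Queue, Empty

def fill_schedule(schedule, tasks):
    # tasks: iterable of (name, start_url) pairs
    for name, start_url in tasks:
        schedule.put((name, start_url))

def drain_schedule(schedule):
    while True:
        try:
            name, start_url = schedule.get(block=False)
        except Empty:
            break  # mirrors the "queue drained" branch above
        print("would create a crawler for", name, start_url)

schedule = Queue()
fill_schedule(schedule, [("task_1", "http://example.com/1")])
drain_schedule(schedule)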
        # # print(tradingday)
        # if data_Text == 'IF1810':
        #     # print(data_Text + ":" + volume)
        #     self.total_sale_1810 += int(volume)
        #
        # if data_Text == 'IF1811':
        #     # print(data_Text + ":" + volume)
        #     self.total_sale_1811 += int(volume)
        #
        # if data_Text == 'IF1812':
        #     # print(data_Text + ":" + volume)
        #     self.total_sale_1812 += int(volume)
        #
        # if data_Text == 'IF1903':
        #     # print(data_Text + ":" + volume)
        #     self.total_sale_1903 += int(volume)
        #
        # if data_Text == 'IF1906':
        #     # print(data_Text + ":" + volume)
        #     self.total_sale_1906 += int(volume)
        return None


if __name__ == '__main__':
    settings = Setting()
    crawler_01 = Crawler(Cffex_Rank, settings)
    c1 = crawler_01.crawl()
    c1.addBoth(lambda _: reactor.stop())
    reactor.run()
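# The commented-out block above tallies volume per contract (IF1810, IF1811, ...)
# into separate attributes. A minimal alternative sketch of the same tally,
# assuming data_Text is the contract code and volume is a numeric string;
# total_sale here is a hypothetical dict, not an attribute of the original spider.
from collections import defaultdict

total_sale = defaultdict(int)

def tally(data_Text, volume):
    # Accumulate volume under its contract code.
    total_sale[data_Text] += int(volume)

tally('IF1810', '1200')
tally('IF1810', '300')
print(dict(total_sale))  # {'IF1810': 1500}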
def request_errback(content):
    print("request_and_response errback")
    print(content[1])
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


request = Request(url=url, callback=request_callback, method='get',
                  headers=headers, errback=request_errback,
                  meta={"download_timeout": 2})

settings = Setting()
crawler = Crawler(LJSpider, settings)
spider = crawler._create_spider()
downloader = Downloader(crawler)

"""
httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
"""

agent = downloader.fetch(request, spider)
agent.addCallback(request_callback)
agent.addBoth(lambda _: reactor.stop())
reactor.run()
from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
from twisted.internet import reactor, defer
import logging
from test.framework.test.test_example.check_spidermw.simple_spider_spidermw import LJSpiderMw


def finish_crawl(content, spider):
    logging.info("finish===>%d" % spider._item_num)
    return


settings = Setting()
crawler_01 = Crawler(LJSpiderMw, settings)
c1 = crawler_01.crawl()

dd = defer.DeferredList([c1])
dd.addCallback(finish_crawl, crawler_01.spider)
dd.addBoth(lambda _: reactor.stop())
reactor.run()
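# Note on the addCallback call above: Twisted passes any extra positional
# arguments to the callback after the deferred's result, which is why
# finish_crawl receives (content, spider). Minimal standalone illustration
# with an already-fired deferred standing in for crawler.crawl():
from twisted.internet import defer

def finish_crawl_demo(result, spider_name):
    print(result, "from", spider_name)
    return result

d = defer.succeed("3 items scraped")
d.addCallback(finish_crawl_demo, "LJSpiderMw")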
def request_errback(content):
    print("request_and_response errback")
    print(content[1])
    return content


def agent_print(content):
    print("agent_print")
    print(type(content))
    print(content)


request = Request(url=url, callback=request_callback, method='get',
                  headers=headers, errback=request_errback,
                  meta={"download_timeout": 2})

settings = Setting()
crawler = Crawler(Spider1, settings)
spider = crawler._create_spider()
downloader = Downloader(crawler)

"""
httphandler = HTTPDownloadHandler(settings)
agent = httphandler.download_request(request, spider)
agent.addCallback(agent_print)
agent.addErrback(request_errback)
"""

agent = downloader.fetch(request, spider)
agent.addCallback(request_callback)
agent.addCallback(get_smzdm_datas)
agent.addCallback(print_smzdm_result, url)
agent.addBoth(lambda _: reactor.stop())
reactor.run()
from test.framework.spider import Test_Spider_2
from test.framework.spider.test_spider.test_Spider_03 import Test_Spider_3
from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
from test.framework.spider.test_spider.test_Spider_01 import Test_Spider_1
from twisted.internet import reactor, defer
import logging

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)


def finish_crawl(content):
    logging.info("finish")
    return content


settings = Setting()
crawler_01 = Crawler(Test_Spider_1, settings)
crawler_02 = Crawler(Test_Spider_2, settings)
crawler_03 = Crawler(Test_Spider_3, settings)

spider_01 = crawler_01._create_spider()
spider_02 = crawler_02._create_spider()
spider_03 = crawler_03._create_spider()

c1 = crawler_01.crawl()
c2 = crawler_02.crawl()
c3 = crawler_03.crawl()

dd = defer.DeferredList([c1, c2, c3])
dd.addBoth(lambda _: reactor.stop())
reactor.run()
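# For reference: a DeferredList callback receives a list of (success, result)
# pairs, one per wrapped deferred, so a callback like finish_crawl above could
# log each crawler's outcome individually. Minimal self-contained sketch using
# already-fired deferreds in place of the crawl() results:
from twisted.internet import defer

def report(results):
    for ok, value in results:
        print("success" if ok else "failure", value)
    return results

d1 = defer.succeed("spider_01 done")
d2 = defer.succeed("spider_02 done")
defer.DeferredList([d1, d2]).addCallback(report)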
from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
from twisted.internet import reactor, defer
import logging
from test.framework.test.test_example.agent_proxy.simple_spider_proxy import LJSpiderProxy


def finish_crawl(content, spider):
    logging.info("finish===>%d" % spider._item_num)
    return


settings = Setting()
crawler_01 = Crawler(LJSpiderProxy, settings)
c1 = crawler_01.crawl()

dd = defer.DeferredList([c1])
dd.addCallback(finish_crawl, crawler_01.spider)
dd.addBoth(lambda _: reactor.stop())
reactor.run()
        i = str(i)
        u = url + i
        start_url.append(u)
    for url in start_url:
        # print(url)
        yield Request(url)


def print_err(content):
    print(content)
    return content


settings = Setting()
crawler_01 = Crawler(Test_Spider_1, settings)
crawler_02 = Crawler(Test_Spider_2, settings)
crawler_03 = Crawler(Test_Spider_3, settings)

spider1 = crawler_01._create_spider()
spider2 = crawler_02._create_spider()
spider3 = crawler_03._create_spider()

engine_01 = ExecutionEngine(crawler_01, finish_crawl)
engine_02 = ExecutionEngine(crawler_02, finish_crawl)
engine_03 = ExecutionEngine(crawler_03, finish_crawl)

start_requests = [start_request_01(), start_request_02(), start_request_03()]
engines = [engine_01, engine_02, engine_03]
downloads = []

for start_request, engine in zip(start_requests, engines):
    # print(request.url)
    engine.start()
            if re.match(self.instrument_bu, instrument):
                result_temp = instrument + ':' + str(rank) + ':' + str(volume)
                result[b(self.instrument_bu)].append(result_temp)
            if re.match(self.instrument_ru, instrument):
                result_temp = instrument + ':' + str(rank) + ':' + str(volume)
                result[b(self.instrument_ru)].append(result_temp)
            if re.match(self.instrument_sp, instrument):
                result_temp = instrument + ':' + str(rank) + ':' + str(volume)
                result[b(self.instrument_sp)].append(result_temp)
            if re.match(self.instrument_nr, instrument):
                result_temp = instrument + ':' + str(rank) + ':' + str(volume)
                result[b(self.instrument_nr)].append(result_temp)

        self.total_result[time[0]] = result
        return None


if __name__ == '__main__':
    settings = Setting()
    crawler_01 = Crawler(SHFE_Rank, settings)
    c1 = crawler_01.crawl()
    c1.addBoth(lambda _: reactor.stop())
    reactor.run()
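# The four if re.match(...) blocks above differ only in the instrument prefix,
# so they can be collapsed into one loop. A standalone sketch of that pattern:
# the prefixes, b(), and the result dict below are simplified stand-ins for the
# spider's own attributes, not framework API.
import re
from collections import defaultdict

prefixes = ['bu', 'ru', 'sp', 'nr']      # stand-ins for self.instrument_*
b = lambda s: s.encode('utf-8')          # stand-in for the b() helper above
result = defaultdict(list)

def classify(instrument, rank, volume):
    for prefix in prefixes:
        if re.match(prefix, instrument):
            result[b(prefix)].append(instrument + ':' + str(rank) + ':' + str(volume))

classify('ru1905', 1, 3500)
print(dict(result))  # {b'ru': ['ru1905:1:3500']}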
import logging

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)

# MongoDB server address and port
mongo_url = "127.0.0.1:27017"
# Connect to MongoDB; if no argument is given, it defaults to "localhost:27017"
client = pymongo.MongoClient(mongo_url)

# Connect to the LianJia database
DATABASE = "LianJia"
db = client[DATABASE]
# Connect to the collection (table): LianJia.XiaoQu
COLLECTION = "XiaoQu"
db_coll = db[COLLECTION]

projectionFields = {'_id': False}  # fields to return, specified as a dict
queryArgs = {"total_zone_name": "pudong"}
searchRes = db_coll.find(queryArgs, projectionFields)
scheduler = searchRes.next()

settings = Setting()
crawler_01 = Crawler(Part_Zone, settings)
crawler_01._create_spider_schedule(scheduler)
c1 = crawler_01.crawl()
c1.addBoth(lambda _: reactor.stop())
reactor.run()
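# searchRes above is a pymongo cursor, and .next() only pulls the first
# matching document. If every matching "pudong" document should seed a crawl,
# the cursor can be iterated instead. Standalone sketch of the pymongo side
# only; how each document maps onto _create_spider_schedule is framework
# specific and left as a print here.
import pymongo

client = pymongo.MongoClient("127.0.0.1:27017")
coll = client["LianJia"]["XiaoQu"]
for doc in coll.find({"total_zone_name": "pudong"}, {'_id': False}):
    print(doc)  # each document could seed its own spider schedule here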
import logging

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)

# MongoDB server address and port
mongo_url = "127.0.0.1:27017"
# Connect to MongoDB; if no argument is given, it defaults to "localhost:27017"
client = pymongo.MongoClient(mongo_url)

# Connect to the LianJia database
DATABASE = "LianJia"
db = client[DATABASE]
# Connect to the collection (table): LianJia.XiaoQu
COLLECTION = "XiaoQu"
db_coll = db[COLLECTION]

projectionFields = {'_id': False}  # fields to return, specified as a dict
queryArgs = {"total_zone_name": "pudong"}
searchRes = db_coll.find(queryArgs, projectionFields)
scheduler = searchRes.next()

settings = Setting()
crawler_01 = Crawler(Part_Zone, settings)
crawler_01._create_spider()
c1 = crawler_01.crawl()
c1.addBoth(lambda _: reactor.stop())
reactor.run()
from test.framework.core.crawler import Crawler, _get_spider_loader
from twisted.internet.defer import DeferredList
from twisted.internet import reactor
from test.framework.setting import Setting

s = Setting()
cls = _get_spider_loader(s)
_active = set()

for name, module in cls._spiders.items():
    crawler = Crawler(module, s)
    # spider = crawler._create_spider()
    d = crawler.crawl()
    _active.add(d)

dd = DeferredList(_active)
if dd.called:
    print("have called")
# dd.addCallback(lambda _: reactor.callLater(5, crawler.stop))
# dd.addCallback(crawler.stop)
dd.addBoth(lambda _: reactor.stop())
reactor.run()
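# The commented-out callLater line above hints at a timed shutdown. A minimal
# standalone sketch of that Twisted pattern: schedule a stop callback a few
# seconds after startup instead of waiting for the DeferredList to fire.
# stop_everything is illustrative, not a framework method.
from twisted.internet import reactor

def stop_everything():
    print("timed stop")  # e.g. call crawler.stop() for each active crawler here
    reactor.stop()

reactor.callLater(5, stop_everything)
reactor.run()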
from test.framework.setting import Setting
from test.framework.core.crawler import Crawler
from test.framework.spider.test_spider.test_Spider_01 import Test_Spider_1
from twisted.internet import reactor, defer
import logging

LOG_FORMAT = '%(asctime)s-%(filename)s[line:%(lineno)d]-%(levelname)s: %(message)s'
DATE_FORMAT = "%m/%d/%Y %H:%M:%S %p"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)


def finish_crawl(content):
    logging.info("finish")
    return content


settings = Setting()
crawler_01 = Crawler(Test_Spider_1, settings)
spider_01 = crawler_01._create_spider()
c1 = crawler_01.crawl()

dd = defer.DeferredList([c1])
dd.addBoth(lambda _: reactor.stop())
reactor.run()