def crawler_start(usage, tasks):
    """Start the specified spiders or validators from the command line using the Scrapy core API.

    There are four kinds of spiders: common, ajax, gfw, ajax_gfw.
    If you don't assign any tasks, all of these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
        if not spiders:
            # crawler_logger.warning('no spider starts up, please check your task input')
            return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    # runner = CrawlerRunner(settings)
    # runner.crawl(BookToscrapeSpider())

    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    # crawl() expects a spider class (or name), not an instance
    runner.crawl(BookToscrapeSpider_crawl)

    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        # 'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    settings.update(options)
    runner = CrawlerRunner(settings)
    # pass the spider class; CrawlerRunner.crawl instantiates it itself
    runner.crawl(EntertainmentcareersSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
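# If the crawls above must not overlap, the same runner can be driven
# sequentially instead; a minimal sketch of the inlineCallbacks pattern from
# the Scrapy docs ('first_spider'/'second_spider' are placeholder names).
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl_sequentially():
    # each yield waits for the previous crawl to finish before starting the next
    yield runner.crawl('first_spider')   # placeholder spider name
    yield runner.crawl('second_spider')  # placeholder spider name
    reactor.stop()

crawl_sequentially()
reactor.run()  # blocks until both crawls have finished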
def test_start_requests_dupes(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
    yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 6)
    yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 3)
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def test_timeout_failure(self):
    crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
    yield crawler.crawl(n=0.5, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
    # server hangs after receiving response headers
    yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
def test_crawler_runner_bootstrap_failed_for_several(self):
    runner = CrawlerRunner()
    try:
        yield runner.crawl(ExceptionSpider)
    except ValueError:
        pass
    else:
        self.fail('Exception should be raised from spider')
    yield runner.crawl(NoRequestsSpider)
    self.assertEqual(runner.bootstrap_failed, True)
def startprocess(queue):
    runner = CrawlerRunner(get_project_settings())
    dfs = set()
    # in the callback, the argument 1 identifies linkspider
    l = runner.crawl('linkspider', website='http://caffe.berkeleyvision.org/',
                     domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(l)
    # in the callback, the argument 2 identifies srcspider
    s = runner.crawl('srcspider', website='http://caffe.berkeleyvision.org/',
                     domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(s)
    # in the callback, the argument 3 identifies codespider
    c = runner.crawl('codespider', website='http://caffe.berkeleyvision.org/',
                     domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(c)
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
def handle_lj(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(crawler_setting)
    # d = runner.crawl(HouseSpider)
    d = runner.crawl(LianjiaHouseSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())
    # add crawl to redis
    key = "test-spider:istresearch.com:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)

    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()

    thread = threading.Thread(target=thread_func)
    thread.start()
    reactor.run()

    message_count = 0
    m = next(self.consumer)
    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1
    self.assertEqual(message_count, 1)
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    # settings.set('FEED_FORMAT', 'json')
    # settings.set('FEED_URI', 'result.json')
    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings, priority='project')
        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def test_same_url(self):

    class TestSameUrlSpider(Spider):
        name = 'test_same_url'

        def __init__(self, *args, **kwargs):
            super(TestSameUrlSpider, self).__init__(*args, **kwargs)
            self.visited = 0

        def start_requests(s):
            return self.conman.from_spider(s, self.results)

        def parse_first(self, response):
            self.visited += 1
            return TestItem()

        def parse_second(self, response):
            self.visited += 1
            return TestItem()

    with MockServer() as mockserver:
        contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))
        get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
        get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

        crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
        yield crawler.crawl()

    self.assertEqual(crawler.spider.visited, 2)
def crawl(self):
    runner = CrawlerRunner(self.Scrapy_Module_setting)
    # crawl() takes the spider class; the runner creates the instance itself
    cra = runner.crawl(Scrapy_ModuleSpider)
    # stop reactor when spider closes
    cra.addBoth(lambda _: self.spider_closing(cra))
    self.logger.info("Run reactor")
    reactor.run()
def webcrawl(queue, webs, dom):
    try:
        runner = CrawlerRunner(get_project_settings())
        dfs = set()
        # in the callback, the argument 1 identifies linkspider
        l = runner.crawl('linkspider', website=webs, domain=dom) \
            .addCallback(setflag, queue).addErrback(err, queue)
        dfs.add(l)
        # in the callback, the argument 2 identifies srcspider
        s = runner.crawl('srcspider', website=webs, domain=dom) \
            .addCallback(setflag, queue).addErrback(err, queue)
        dfs.add(s)
        # in the callback, the argument 3 identifies codespider
        c = runner.crawl('codespider', website=webs, domain=dom) \
            .addCallback(setflag, queue).addErrback(err, queue)
        dfs.add(c)
        defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
        # the script will block here until all crawling jobs are finished
        reactor.run()
    except Exception as e:
        print(e)
def runSpider(self, spider):
    configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
    settings = Settings()
    settings.set('FEED_URI', 'output.json')
    settings.set('FEED_FORMAT', 'json')
    runner = CrawlerRunner(settings)
    dfd = runner.crawl(spider)
    dfd.addBoth(lambda _: reactor.stop())
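# On Scrapy 2.1+ the FEED_URI/FEED_FORMAT pair used above is superseded by the
# FEEDS setting; a minimal sketch of the equivalent configuration (the file
# name is illustrative).
from scrapy.settings import Settings

settings = Settings()
# one key per output target; format, encoding, etc. live in the per-feed dict
settings.set('FEEDS', {
    'output.json': {'format': 'json', 'encoding': 'utf8'},
})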
class Runner(object):

    def __init__(self, *args, **kwargs):
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings)

    def add(self, *a, **kw):
        crawler = Crawler(BroadSpider, self.settings)
        self.runner.crawl(crawler, *a, **kw)

    def start(self):
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
def _test_delay(self, delay, randomize):
    settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
    crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
    yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
    t = crawler.spider.times
    totaltime = t[-1] - t[0]
    avgd = totaltime / (len(t) - 1)
    tolerance = 0.6 if randomize else 0.2
    self.assertTrue(avgd > delay * (1 - tolerance),
                    "download delay too small: %s" % avgd)
def main():
    locale.setlocale(locale.LC_TIME, 'es_ES')
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()
    d = runner.crawl(LotoSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    return None
def run_login_spider(seed_url, username, password, db_name, logfile = "results.log"): init_db(db_name) settings = get_project_settings() runner = CrawlerRunner(settings) d = runner.crawl(LoginFinderSpider, seed_url = seed_url, username = username, password = password) d.addBoth(lambda _: reactor.stop()) log.start(loglevel=log.DEBUG, logfile=logfile) log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level = log.INFO) reactor.run()
def crawl(self): os.environ['SCRAPY_PROJECT'] = '{0}/{1}'.format(BASE_DIR, 'collector') runner = CrawlerRunner({'LOG_LEVEL': 'WARNING', 'LOG_FORMATTER': 'collector.collector.utils.PoliteLogFormatter', 'ITEM_PIPELINES': { 'collector.collector.pipelines.CodingDuplicatesPipeline': 1, 'collector.collector.pipelines.CodingPriorityPipeline': 2 }}) # runner = CrawlerRunner() d = runner.crawl(CodingProjectSpider) d.addBoth(lambda _: reactor.stop()) reactor.run()
def run_and_export(self, spider_cls, settings=None): """ Run spider with specified settings; return exported data. """ tmpdir = tempfile.mkdtemp() res_name = tmpdir + "/res" defaults = {"FEED_URI": "file://" + res_name, "FEED_FORMAT": "csv"} defaults.update(settings or {}) try: with MockServer() as s: runner = CrawlerRunner(Settings(defaults)) yield runner.crawl(spider_cls) with open(res_name, "rb") as f: defer.returnValue(f.read()) finally: shutil.rmtree(tmpdir)
def perform_scrape(): '''Perform a MunchSpider scrape using the current Scrapy Settings ''' settings = scrapingtools.get_settings() publisher_database = get_publisher_database(settings,mongo=False) configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) (doi_links,doi_sources) = get_joblist(settings.get('COLLECT_FILE_NAME')) domains = get_domains(publisher_database) runner=CrawlerRunner(settings) d=runner.crawl(Spiders.MunchSpider.MunchSpider, start_urls=doi_links, crossref_items = doi_sources, allowed_domains=domains, publisher_database=publisher_database, ) d2=d.addBoth(lambda _: reactor.stop()) d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('COMPLETE_FILE_NAME'))) d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('ERROR_FILE_NAME')))
def _run_feed_spider(url, feed):
    spid = str(uuid.uuid4())
    feed['_id'] = spid
    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    save_feed(url)
    cls = SpiderFactory.create_spider(feed)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)
    n = get_stats([spid])[spid]
    if n == 0:
        raise Exception('feed spider crawled 0 articles')
    if is_exists_spider(url):
        raise Exception(f'feed[{url}] existed')
    del feed['_id']
    save_spider_settings(feed)
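# Because the Twisted reactor cannot be restarted within a process, a helper
# like _run_feed_spider is often wrapped in a child process when it has to run
# more than once; a minimal sketch under that assumption.
from multiprocessing import Process

def run_feed_spider_once(url, feed):
    # each crawl gets a fresh process, and therefore a fresh reactor
    p = Process(target=_run_feed_spider, args=(url, feed))
    p.start()
    p.join()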
def run_spiders():
    """
    Note:
    If this caller is the outermost loop of the program, the crawler's project
    settings can be loaded directly here with:

        from scrapy.utils.project import get_project_settings
        # some code
        runner = CrawlerRunner(get_project_settings())

    If this call is only a wrapped helper function, the settings have to be
    built by hand, as in the code below.
    """
    # define the log format
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    # build the settings for this crawl; here we pick the pipeline to use
    settings = Settings()
    settings.set('ITEM_PIPELINES', {'spider.tutorial.pipelines.TutorialPipeline': 300})
    # hand the settings to the crawler runner
    runner = CrawlerRunner(settings)
    d = runner.crawl(ChinazSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
def run_and_export(self, spider_cls, settings=None): """ Run spider with specified settings; return exported data. """ tmpdir = tempfile.mkdtemp() res_path = os.path.join(tmpdir, 'res') res_uri = urljoin('file:', pathname2url(res_path)) defaults = { 'FEED_URI': res_uri, 'FEED_FORMAT': 'csv', } defaults.update(settings or {}) try: with MockServer() as s: runner = CrawlerRunner(Settings(defaults)) spider_cls.start_urls = [s.url('/')] yield runner.crawl(spider_cls) with open(res_path, 'rb') as f: content = f.read() finally: shutil.rmtree(tmpdir, ignore_errors=True) defer.returnValue(content)
# coding: utf-8
from __future__ import unicode_literals
from __future__ import absolute_import

import sys
import os
import logging

from settings import BASE_DIR, USER_AGENTS

# PRO_PATH = '{0}/{1}'.format(BASE_DIR, 'shadow')
sys.path.append(BASE_DIR)
os.environ['SCRAPY_PROJECT'] = BASE_DIR

from Shadow.spiders.zhihu_spider import ZHPeopleColumnSpider
from scrapy.conf import settings  # deprecated settings singleton (pre-Scrapy 1.0 API)
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

settings.overrides.update({'USER_AGENTS': USER_AGENTS})

count = 1
while 1:
    # note: the Twisted reactor is not restartable, so only the first
    # iteration of this loop can actually crawl; `count` is also never updated
    logging.info('start crawl people column by {0} times'.format(count))
    process = CrawlerRunner(settings)
    d = process.crawl(ZHPeopleColumnSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
newFp.close() fp.close() def transfer(self,a): b='' for i in a.split('\\'): if(len(i) == 0): continue if(i[0] == 't'): b += '\t' if(len(i)>1): b += i[1:] continue if(i[0] =='n'): b += '\n' if(len(i)>1): b += i[1:] continue if(i[0] == 'r'): b += '\r' if(len(i)>1): b += i[1:] continue b += i return b runner = CrawlerRunner() d = runner.crawl(MySpider) d.addBoth(lambda _: reactor.stop())
def scrape_with_crochet(spider):
    global data
    data = []
    crawl_runner = CrawlerRunner()
    dispatcher.connect(process_result, signal=item_scraped)
    return crawl_runner.crawl(spider)
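# For callers outside the Twisted reactor (e.g. a Flask view), crochet can
# manage the reactor thread; a minimal sketch, assuming the crochet package is
# installed and `spider` is a Spider subclass.
from crochet import setup, wait_for
from scrapy.crawler import CrawlerRunner

setup()  # starts the Twisted reactor in a background thread

@wait_for(timeout=60.0)
def run_spider_blocking(spider):
    # blocks the caller until the crawl's Deferred fires (or raises on timeout)
    runner = CrawlerRunner()
    return runner.crawl(spider)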
def handle(self, *args, **options): configure_logging() process = CrawlerRunner(get_project_settings()) # film crawlers process.crawl('acfilm') process.crawl('fpp') process.crawl('retrospekt') process.crawl('brooklyn-film') process.crawl('precision-film') process.crawl('bhfilm') process.crawl('freestyle') process.crawl('moment') process.crawl('ultrafine') # camera crawlers process.crawl('brooklyn') process.crawl('austin_camera') process.crawl('precision') process.crawl('keh') process.crawl('bh') # not super impressed with etsy, tbh # process.crawl('etsy') d = process.join() d.addBoth(lambda _: reactor.stop()) reactor.run()
#!/usr/bin/env python # -*- encoding: utf-8 -*- """ Topic: sample Desc : """ import logging from spiders.user_relationship_nets import UserRelationshipNetsSpider from twisted.internet import reactor from scrapy.crawler import CrawlerRunner from scrapy.utils.project import get_project_settings from scrapy.utils.log import configure_logging if __name__ == '__main__': settings = get_project_settings() configure_logging(settings) runner = CrawlerRunner(settings) runner.crawl(UserRelationshipNetsSpider, 'xrcy168', runner) d = runner.join() d.addBoth(lambda _: reactor.stop()) reactor.run() logging.info('all finished.')
import logging

from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from sqlalchemy.orm import sessionmaker
from twisted.internet import reactor

from mengEventProject.models import db_connect
from mengEventProject.models import ArticleRule
from mengEventProject.spiders.articleSpider import ArticleSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)

    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()

    runner = CrawlerRunner(settings)
    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # blocks the process, so always keep this as the last statement
    reactor.run()
    logging.info('all finished.')
from twisted.internet import reactor
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from jobs.spiders.bulldogjob import BulldogjobSpider

settings = get_project_settings()
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings)

d = runner.crawl(BulldogjobSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings

from cnblogSpider.spiders.cnblogs import CnblogsSpider
from cnblogSpider.spiders.douban import DoubanSpider
from cnblogSpider.spiders.zufang import ZufangSpider

if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(ZufangSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
def prelaunch(user, **kwargs):
    json_parser = kwargs.get('json_parser') or False
    csv_parser = kwargs.get('csv_parser') or False
    start_time = kwargs.get('start_time') or ''

    settings = get_project_settings()
    if json_parser and csv_parser:
        settings.set("FEEDS", {
            "%s_posts.json" % user: {'format': 'json', 'encoding': 'utf8'},
            pathlib.Path('%s_posts.csv' % user): {'format': 'csv'},
        })
    elif csv_parser:
        settings.set("FEEDS", {
            pathlib.Path('%s_posts.csv' % user): {'format': 'csv'},
        })
    else:
        # default, also used when neither parser is requested, so the result
        # can be read back from the JSON feed below
        settings.set("FEEDS", {
            "%s_posts.json" % user: {'format': 'json', 'encoding': 'utf8'},
        })

    # configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    # logging.getLogger('scrapy').propagate = False
    runner = CrawlerRunner(settings)
    d = runner.crawl(ProfilSpider, profil=user, time=start_time)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished

    if not json_parser and not csv_parser:
        try:
            with open("%s_posts.json" % user, encoding='UTF-8') as f:
                data = json.load(f)
            # os.remove("%s_posts.json" % user)
            return data
        except Exception:
            return 'cannot parse data'
dont_filter=True ) def parse(self, response): # load IP which made the request from json string ip = loads(response.text)['origin'] yield {'ip': ip} def get_settings() -> Settings: settings = Settings() # Enter your package credentials here! settings.set('PROXYLAND', { 'username': '******', 'password': '******' }) # enable ProxylandMiddleware and HttpProxyMiddleware settings.set('DOWNLOADER_MIDDLEWARES', { 'middleware.ProxylandMiddleware': 350, 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, }) return settings if __name__ == '__main__': # routine to run scrapy from a script # see: https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script settings = get_settings() runner = CrawlerRunner(settings) d = runner.crawl(IpSpider) d.addBoth(lambda _: reactor.stop()) reactor.run()
class ScrAPI(Flask): def __init__(self, import_name=__package__, **kwargs): super(ScrAPI, self).__init__(import_name, **kwargs) configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) self._init_url_rules() self.process = CrawlerRunner(get_project_settings()) self.tp = reactor.getThreadPool() self.database = DatabaseConnector(DATABASE_URL) self.response_meta = {"meta": {"project": "WSF Web Scraper"}} def __del__(self): self.database._close_all_spiders() self.database.cursor.close() self.database.connection.close() def run(self, host=None, port=None, debug=None, **options): super(ScrAPI, self).run(host, port, debug, **options) def _get_meta_response(self, res): res.update(self.response_meta) return res def _init_url_rules(self): """Attach the endpoints to run spiders and list the spiders that are available in the API """ self.add_url_rule( '/spiders', view_func=self.list_spiders, methods=['GET'], ) self.add_url_rule( '/spiders', view_func=self.run_spider, methods=['POST'], ) self.add_url_rule( '/spiders/<int:spider_id>', view_func=self.close_spider, methods=['DELETE'], ) self.add_url_rule( '/database', view_func=self.import_db, methods=['POST'], ) self.add_url_rule( '/database', view_func=self.export_db, methods=['GET'], ) self.add_url_rule( '/database', view_func=self.clear_scraps, methods=['DELETE'], ) self.add_url_rule( '/crawls', view_func=self.list_crawls, methods=['GET'], ) self.add_url_rule( '/crawls', view_func=self.stop, methods=['DELETE'], ) self.add_url_rule( '/', view_func=self.home, methods=['GET'], ) def home(self): routes = [{ "url": "/spiders", "method": "GET" }, { "url": "/spiders", "method": "POST", "arguments": { "spider": "name of the spider to run" } }, { "url": "/spiders/:spider_id", "method": "DELETE", "arguments": { "spider_id": "uuid of the spider to close" } }, { "url": "/crawls", "method": "GET" }, { "url": "/crawls", "method": "DELETE" }, { "url": "/database", "method": "GET" }, { "url": "/database", "method": "POST", "arguments": { "file": "json file containing the database dump" } }, { "url": "/database", "method": "DELETE" }] result = self._get_meta_response({"routes": routes}) return jsonify(result), 200 def list_spiders(self): spiders = self.process.spider_loader.list() return jsonify({"spiders": spiders, "status": "success"}), 200 def run_spider(self): post_data = request.get_json() spider = post_data.get('spider') if spider == 'who_iris': spider = who_iris_spider.WhoIrisSpider() elif spider == 'nice': spider = nice_spider.NiceSpider() else: return '', 404 spider_id = str(uuid.uuid4()) self.process.crawl(spider, uuid=spider_id) crawl = self.process.join() self.database.insert_spider(spider.name, spider_id) crawl.addBoth(self.on_success) return jsonify({ "data": { "status": "running", "spider": spider.name, "_id": spider_id } }), 200 def on_success(self, data): self.database._close_all_spiders() def close_spider(self, spider_id): for crawl in self.process.crawlers: if crawl.spider.uuid == uuid: crawl.stop() return jsonify( {"data": { "status": "success", "_id": spider_id }}), 200 return '', 400 def list_crawls(self): crawls = self.process.crawlers running_spiders = [] for crawl in crawls: start_time = crawl.stats.get_value('start_time') spider = { '_id': crawl.spider.uuid, 'spider': crawl.spider.name, 'start_time': start_time, 'total_time': str(datetime.now() - start_time), 'item_dropped': crawl.stats.get_value('item_dropped_count'), 'item_scraped': crawl.stats.get_value('item_scraped_count'), 'total_requests': 
crawl.stats.get_value('downloader/request_count'), } running_spiders.append(spider) finished_spiders = [] for spider in self.database.get_finished_crawls(): finished_spiders.append(spider) spiders = {"crawling": running_spiders, "finished": finished_spiders} return jsonify({"data": {"spiders": spiders}}), 200 def stop(self): self.process.stop() return jsonify({"data": {"status": "success"}}), 200 def export_db(self): articles_rows = self.database.get_articles() articles = [] now = datetime.now() for title, file_hash, url in articles_rows: articles.append({ 'title': title, 'file_hash': file_hash, 'url': url, }) json_file = tempfile.NamedTemporaryFile() json_file.write(json.dumps(articles).encode('utf-8')) json_file.seek(0) return send_file(json_file, mimetype='application/json', as_attachment=True, attachment_filename=f'export-{now}.json') def import_db(self): if request.files: data_file = request.files.get('file') if data_file.filename == '': return 'Filename must not be blank', 400 if data_file.content_type == 'application/json': json_file = data_file.stream.read() else: return 'File format is not json.', 415 try: json_dict = json.loads(json_file) for article in json_dict: self.database.insert_article(article.get('title'), article.get('file_hash'), article.get('url')) return '', 201 except Exception as e: result = {"errors": [str(e)]} return jsonify(result), 400 else: return 'No JSON file in request', 400 def clear_scraps(self): try: self.database.reset_scraped() return '', 204 except Exception as e: return str(e), 500
def Crawl_job(self, URL, Next):
    Runner = CrawlerRunner(settings=self.spider_settings)
    return Runner.crawl(eval(self.SpiderName), profile_url=URL, next=Next)
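# When the spider is held by name, crawl() can resolve it through the project's
# spider loader, which avoids eval(); a sketch assuming self.SpiderName matches
# the spider's registered `name` and SPIDER_MODULES is set in spider_settings.
def Crawl_job_by_name(self, URL, Next):
    runner = CrawlerRunner(settings=self.spider_settings)
    # a string argument is looked up via runner.spider_loader.load() internally
    return runner.crawl(self.SpiderName, profile_url=URL, next=Next)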
import sys

from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

from weibo_scrapy.spiders.weibo import WeiboSpider
from weibo_scrapy.spiders.weibo_comment import CommentSpider
from weibo_scrapy.spiders.weibo_repost import RepostSpider

# Takes two arguments: {type} {line}, e.g. python crawler_run.py weibo 1,重庆发布,1988438334,20,False@_@
# type is one of: weibo | repost | comment
if __name__ == '__main__':
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())
    if sys.argv[1] == 'weibo':
        d = runner.crawl(WeiboSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())
    elif sys.argv[1] == 'repost':
        d = runner.crawl(RepostSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())
    elif sys.argv[1] == 'comment':
        d = runner.crawl(CommentSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())
    reactor.run()

# Search by user id: field 1: type = 1, field 2: user name, field 3: user id,
# field 4: number of pages, field 5: whether to fetch images
# example: 1,重庆发布,1988438334,3,True
# execute(("scrapy crawl weibo -a line=1,重庆发布,1988438334,20,False@_@").split(" "))
# Search by keyword: field 1: type = 2, field 2: keyword,
# field 3: 1-general 60-popular 61-realtime, field 4: number of pages, field 5: whether to fetch images
'//*[@id="ficha_producto_int"]/h1/text()').extract_first() ml_item['precio'] = response.xpath( '//*[@id="PriceProduct"]/text()[not(parent::span[@class="SignoPriceProduct"])and normalize-space()]' ).extract() ml_item['link'] = response.xpath( '//*[@id="HeaderInfoMiddlePerfil_Box_2"]/a/@href').extract() ml_item['sku'] = response.xpath( '//*[@id="imagen_producto"]/div[@class="dvInfoGral"][1]/span[@class="txValueInfoGral"]/text()' ).extract() ml_item['plataforma'] = response.xpath( '//*[@id="imagen_producto"]/div[@class="dvInfoGral"][2]/span[@class="txValueInfoGral"]/text()' ).extract() self.item_count += 1 if self.item_count > 100: raise CloseSpider('item_exceeded') yield ml_item configure_logging() runner = CrawlerRunner() runner.crawl(SVGSpiderPS4) runner.crawl(SVGSpiderPS3) runner.crawl(SVGSpiderXONE) runner.crawl(SVGSpiderPSVITA) runner.crawl(SVGSpiderNSWI) runner.crawl(SVGSpiderWIIU) d = runner.join() d.addBoth(lambda _: reactor.stop()) reactor.run()
def test_start_requests_lazyness(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
    yield crawler.crawl(mockserver=self.mockserver)
class CrawlTestCase(TestCase): def setUp(self): self.mockserver = MockServer() self.mockserver.__enter__() self.runner = CrawlerRunner() def tearDown(self): self.mockserver.__exit__(None, None, None) @defer.inlineCallbacks def test_follow_all(self): crawler = self.runner.create_crawler(FollowAllSpider) yield crawler.crawl(mockserver=self.mockserver) self.assertEqual(len(crawler.spider.urls_visited), 11) # 10 + start_url @defer.inlineCallbacks def test_fixed_delay(self): yield self._test_delay(total=3, delay=0.2) @defer.inlineCallbacks def test_randomized_delay(self): yield self._test_delay(total=3, delay=0.1, randomize=True) @defer.inlineCallbacks def _test_delay(self, total, delay, randomize=False): crawl_kwargs = dict( maxlatency=delay * 2, mockserver=self.mockserver, total=total, ) tolerance = (1 - (0.6 if randomize else 0.2)) settings = { "DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize } crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertTrue(average > delay * tolerance, "download delay too small: %s" % average) # Ensure that the same test parameters would cause a failure if no # download delay is set. Otherwise, it means we are using a combination # of ``total`` and ``delay`` values that are too small for the test # code above to have any meaning. settings["DOWNLOAD_DELAY"] = 0 crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider) yield crawler.crawl(**crawl_kwargs) times = crawler.spider.times total_time = times[-1] - times[0] average = total_time / (len(times) - 1) self.assertFalse(average > delay / tolerance, "test total or delay values are too small") @defer.inlineCallbacks def test_timeout_success(self): crawler = self.runner.create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 > 0) self.assertTrue(crawler.spider.t2 > crawler.spider.t1) @defer.inlineCallbacks def test_timeout_failure(self): crawler = CrawlerRunner({ "DOWNLOAD_TIMEOUT": 0.35 }).create_crawler(DelaySpider) yield crawler.crawl(n=0.5, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) # server hangs after receiving response headers yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver) self.assertTrue(crawler.spider.t1 > 0) self.assertTrue(crawler.spider.t2 == 0) self.assertTrue(crawler.spider.t2_err > 0) self.assertTrue(crawler.spider.t2_err > crawler.spider.t1) @defer.inlineCallbacks def test_retry_503(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_failed(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_dns_error(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: # try to fetch the homepage of a non-existent domain yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def 
test_start_requests_bug_before_yield(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_bug_yielding(self): with LogCapture('scrapy', level=logging.ERROR) as l: crawler = self.runner.create_crawler(BrokenStartRequestsSpider) yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver) self.assertEqual(len(l.records), 1) record = l.records[0] self.assertIsNotNone(record.exc_info) self.assertIs(record.exc_info[0], ZeroDivisionError) @defer.inlineCallbacks def test_start_requests_lazyness(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( BrokenStartRequestsSpider) yield crawler.crawl(mockserver=self.mockserver) #self.assertTrue(False, crawler.spider.seedsseen) #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99), # crawler.spider.seedsseen) @defer.inlineCallbacks def test_start_requests_dupes(self): settings = {"CONCURRENT_REQUESTS": 1} crawler = CrawlerRunner(settings).create_crawler( DuplicateStartRequestsSpider) yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 6) yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver) self.assertEqual(crawler.spider.visited, 3) @defer.inlineCallbacks def test_unbounded_response(self): # Completeness of responses without Content-Length or Transfer-Encoding # can not be determined, we treat them as valid but flagged as "partial" from urllib.parse import urlencode query = urlencode({ 'raw': '''\ HTTP/1.1 200 OK Server: Apache-Coyote/1.1 X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0 Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/ Pragma: no-cache Expires: Thu, 01 Jan 1970 00:00:00 GMT Cache-Control: no-cache Cache-Control: no-store Content-Type: text/html;charset=UTF-8 Content-Language: en Date: Tue, 27 Aug 2013 13:05:05 GMT Connection: close foo body with multiples lines ''' }) crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)), mockserver=self.mockserver) self.assertEqual(str(l).count("Got response 200"), 1) @defer.inlineCallbacks def test_retry_conn_lost(self): # connection lost after receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=0"), mockserver=self.mockserver) self._assert_retried(l) @defer.inlineCallbacks def test_retry_conn_aborted(self): # connection lost before receiving data crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as l: yield crawler.crawl(self.mockserver.url("/drop?abort=1"), mockserver=self.mockserver) self._assert_retried(l) def _assert_retried(self, log): self.assertEqual(str(log).count("Retrying"), 2) self.assertEqual(str(log).count("Gave up retrying"), 1) @defer.inlineCallbacks def test_referer_header(self): """Referer header is set by RefererMiddleware unless it is already set""" req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1) req1 = req0.replace() req2 = 
req0.replace(headers={'Referer': None}) req3 = req0.replace(headers={'Referer': 'http://example.com'}) req0.meta['next'] = req1 req1.meta['next'] = req2 req2.meta['next'] = req3 crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=req0, mockserver=self.mockserver) # basic asserts in case of weird communication errors self.assertIn('responses', crawler.spider.meta) self.assertNotIn('failures', crawler.spider.meta) # start requests doesn't set Referer header echo0 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo0['headers']) # following request sets Referer to start request url echo1 = json.loads(to_unicode( crawler.spider.meta['responses'][1].body)) self.assertEqual(echo1['headers'].get('Referer'), [req0.url]) # next request avoids Referer header echo2 = json.loads(to_unicode( crawler.spider.meta['responses'][2].body)) self.assertNotIn('Referer', echo2['headers']) # last request explicitly sets a Referer header echo3 = json.loads(to_unicode( crawler.spider.meta['responses'][3].body)) self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com']) @defer.inlineCallbacks def test_engine_status(self): from scrapy.utils.engine import get_engine_status est = [] def cb(response): est.append(get_engine_status(crawler.engine)) crawler = self.runner.create_crawler(SingleRequestSpider) yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver) self.assertEqual(len(est), 1, est) s = dict(est[0]) self.assertEqual(s['engine.spider.name'], crawler.spider.name) self.assertEqual(s['len(engine.scraper.slot.active)'], 1) @defer.inlineCallbacks def test_graceful_crawl_error_handling(self): """ Test whether errors happening anywhere in Crawler.crawl() are properly reported (and not somehow swallowed) after a graceful engine shutdown. The errors should not come from within Scrapy's core but from within spiders/middlewares/etc., e.g. raised in Spider.start_requests(), SpiderMiddleware.process_start_requests(), etc. 
""" class TestError(Exception): pass class FaultySpider(SimpleSpider): def start_requests(self): raise TestError crawler = self.runner.create_crawler(FaultySpider) yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_open_spider_error_on_faulty_pipeline(self): settings = { "ITEM_PIPELINES": { "tests.pipelines.ZeroDivisionErrorPipeline": 300, } } crawler = CrawlerRunner(settings).create_crawler(SimpleSpider) yield self.assertFailure( self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver), ZeroDivisionError) self.assertFalse(crawler.crawling) @defer.inlineCallbacks def test_crawlerrunner_accepts_crawler(self): crawler = self.runner.create_crawler(SimpleSpider) with LogCapture() as log: yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawl_multiple(self): self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self._assert_retried(log) self.assertIn("Got response 200", str(log)) @defer.inlineCallbacks def test_crawlspider_with_errback(self): self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("[callback] status 200", str(log)) self.assertIn("[callback] status 201", str(log)) self.assertIn("[errback] status 404", str(log)) self.assertIn("[errback] status 500", str(log)) @defer.inlineCallbacks def test_async_def_parse(self): self.runner.crawl(AsyncDefSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield self.runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse(self): runner = CrawlerRunner({ "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor" }) runner.crawl(AsyncDefAsyncioSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver) with LogCapture() as log: yield runner.join() self.assertIn("Got response 200", str(log)) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_items_list(self): items = [] def _on_item_scraped(item): items.append(item) crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) self.assertIn({'id': 1}, items) self.assertIn({'id': 2}, items) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse(self): from tests.py36._test_crawl import AsyncDefAsyncioGenSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 1) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def 
test_async_def_asyncgen_parse_loop(self): items = [] def _on_item_scraped(item): items.append(item) from tests.py36._test_crawl import AsyncDefAsyncioGenLoopSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenLoopSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) self.assertIn("Got response 200", str(log)) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 10) for i in range(10): self.assertIn({'foo': i}, items) @mark.skipif(sys.version_info < (3, 6), reason="Async generators require Python 3.6 or higher") @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncgen_parse_complex(self): items = [] def _on_item_scraped(item): items.append(item) from tests.py36._test_crawl import AsyncDefAsyncioGenComplexSpider crawler = self.runner.create_crawler(AsyncDefAsyncioGenComplexSpider) crawler.signals.connect(_on_item_scraped, signals.item_scraped) yield crawler.crawl(mockserver=self.mockserver) itemcount = crawler.stats.get_value('item_scraped_count') self.assertEqual(itemcount, 156) # some random items for i in [1, 4, 21, 22, 207, 311]: self.assertIn({'index': i}, items) for i in [10, 30, 122]: self.assertIn({'index2': i}, items) @mark.only_asyncio() @defer.inlineCallbacks def test_async_def_asyncio_parse_reqs_list(self): crawler = self.runner.create_crawler(AsyncDefAsyncioReqsReturnSpider) with LogCapture() as log: yield crawler.crawl(self.mockserver.url("/status?n=200"), mockserver=self.mockserver) for req_id in range(3): self.assertIn("Got response 200, req_id %d" % req_id, str(log)) @defer.inlineCallbacks def test_response_ssl_certificate_none(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=False) yield crawler.crawl(seed=url, mockserver=self.mockserver) self.assertIsNone(crawler.spider.meta['responses'][0].certificate) @defer.inlineCallbacks def test_response_ssl_certificate(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/echo?body=test", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost") @mark.xfail( reason="Responses with no body return early and contain no certificate" ) @defer.inlineCallbacks def test_response_ssl_certificate_empty_response(self): crawler = self.runner.create_crawler(SingleRequestSpider) url = self.mockserver.url("/status?n=200", is_secure=True) yield crawler.crawl(seed=url, mockserver=self.mockserver) cert = crawler.spider.meta['responses'][0].certificate self.assertIsInstance(cert, Certificate) self.assertEqual(cert.getSubject().commonName, b"localhost") self.assertEqual(cert.getIssuer().commonName, b"localhost")
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from sina.spiders import weibo_spider

configure_logging()
runner = CrawlerRunner()
runner.crawl(weibo_spider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
def run_spider(settings, spider):
    runner = CrawlerRunner(settings)
    deferred = runner.crawl(spider)
    return deferred
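# A usage sketch for the helper above; the returned Deferred still has to stop
# the reactor once the crawl finishes (SomeSpider is a placeholder class).
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings

d = run_spider(get_project_settings(), SomeSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # blocks until the crawl has finished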
class ScheduleCrawlerRunner: @staticmethod def interval_to_app_task(interval='daily'): now = datetime.fromtimestamp( os.getenv('NOW', time.time()), pytz.timezone(os.getenv('APP_TIMEZONE', 'Asia/Chongqing')), ) # %Y-%m-%d %H:%M:%S %z # applogger.debug('Interval %s convert to APP_TASK at %s', interval, now) formats = { 'debug': lambda: now.strftime('%Y%m%d_%H%M'), 'hourly': lambda: now.strftime('%Y%m%d_%H'), 'daily': lambda: now.strftime('%Y%m%d'), 'weekly': lambda: now.strftime('%YW%U'), 'monthly': lambda: now.strftime('%Y%m'), } return formats[interval]() def __init__(self, spider_name: str): self.settings = get_project_settings() self.crawler = CrawlerRunner(self.settings) self.round = 0 self.spider_name = spider_name def get_spider_class(self, spider_name: str): spider_module = importlib.import_module('evascrapy.spiders.' + spider_name + '_spider') spider_class = None for name, spider_member in inspect.getmembers(spider_module): if inspect.isclass(spider_member) \ and issubclass(spider_member, CrawlSpider) \ and hasattr(spider_member, 'name') \ and spider_member.name: spider_class = spider_member break return spider_class def schedule(self): scheduler = TwistedScheduler( {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')}) # TODO: use random interval switch = { 'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3), 'hourly': lambda: scheduler.add_job( self.run_crawler, 'interval', seconds=3600), 'daily': lambda: scheduler.add_job( self.run_crawler, 'interval', seconds=86400), 'weekly': lambda: scheduler.add_job( self.run_crawler, 'interval', seconds=86400 * 7), 'monthly': lambda: scheduler.add_job( self.run_crawler, 'interval', seconds=86400 * 30), } switch[self.settings.get('APP_CRAWL_INTERVAL')]() scheduler.start() def run_crawler(self): spider_class = self.get_spider_class(self.spider_name) if os.getenv('APP_DISTRIBUTED'): redis = get_redis(url=self.crawler.settings.get('REDIS_URL')) if len(list(self.crawler.crawlers)) < 1: self.crawler.settings.set( 'APP_TASK', ScheduleCrawlerRunner.interval_to_app_task( self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL'))) if os.getenv('APP_DISTRIBUTED'): if redis.zcount(spider_class.name + ':requests', 0, 100) < 1: for start_url in spider_class.start_urls: redis.sadd(spider_class.name + ':start_urls', start_url) else: self.crawler.settings.set( 'APP_TASK', redis.get(spider_class.name + ':app_task').decode('utf-8')) logger.info( '[SPIDER.%s.%s.DIS_%s.ROUND_%s] started, APP_CRAWL_INTERVAL: %s, APP_STORAGE_SHUFFLE_INTERVAL: %s', spider_class.name, self.crawler.settings.get('APP_TASK'), os.getenv('APP_DISTRIBUTED'), self.round, self.crawler.settings.get('APP_CRAWL_INTERVAL'), self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL')) self.crawler.crawl(spider_class) if os.getenv('APP_DISTRIBUTED'): redis.set(spider_class.name + ':app_task', self.crawler.settings.get('APP_TASK')) self.round += 1 else: logger.info('NEW ROUND SKIPPED BY [SPIDER.%s.%s.DIS_%s.ROUND_%s]', spider_class.name, self.crawler.settings.get('APP_TASK'), os.getenv('APP_DISTRIBUTED'), self.round) def start(self): reactor.run()
# get current working directory cwd = str(pathlib.Path().absolute()) # set path in which to store images settings.set('IMAGES_STORE', cwd + '\\') settings.set( 'IMAGE_URL_FIELDS', { 'white': { 'name_field': 'title', 'sub_folder': 'white', 'path_field': 'white_path', }, 'black': { 'name_field': 'title', 'sub_folder': 'black', 'path_field': 'black_path', } }) # enable the pipeline settings.set('ITEM_PIPELINES', {'pipeline.ImageNamePipeline': 200}) return settings if __name__ == '__main__': # routine to run scrapy from a script # see: https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script settings = get_settings() runner = CrawlerRunner(settings) d = runner.crawl(CatSpider) d.addBoth(lambda _: reactor.stop()) reactor.run()
def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(HMSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        url = 'http://quotes.toscrape.com/'
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)


configure_logging()
runner = CrawlerRunner()
runner.crawl(AuthorSpider)
runner.crawl(QuotesSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
import os import sys, os.path, io import time sys.path.append('D:/home/python364x86/Lib/site-packages') from twisted.internet import reactor import scrapy from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging from hotelSpider import HotelSpider urlList = [] urlList = sys.argv[1].split(",") limit = int(sys.argv[2]) newUrlList = [] for url in urlList: if (len(url) > 1): newUrlList.append(url) configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) runner = CrawlerRunner() d = runner.crawl(HotelSpider, newUrlList, limit) d.addBoth(lambda _: reactor.stop()) reactor.run()
#! /usr/bin/env python
# -*- coding:UTF-8 -*-
# Run several spiders in the same process

from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())
runner.crawl('quotes')
runner.crawl('author')
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
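# When nothing else needs the Twisted reactor, CrawlerProcess removes the
# explicit reactor handling; a sketch equivalent to the snippet above.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('quotes')
process.crawl('author')
process.start()  # starts and stops the reactor internally; blocks until done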
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
        if not spiders:
            # crawler_logger.warning('no spider starts up, please check your task input')
            return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()


@click.command()
@click.option('--usage', default='https', help='Usage of squid')
@click.option('--interval', default=TTL_VALIDATED_RESOURCE,
              help='Updating frequency of the squid conf.')
def squid_conf_update(usage, interval):
    """Timer task for updating proxies in the squid config file"""
    # client_logger.info('the updating task is starting...')
    client = SquidClient(usage)
    client.update_conf()
# coding=utf-8 from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings from twisted.internet import reactor from engine.spiders.BaiduSpider import BaiduSpider from engine.spiders.SpidersList import AssistRedisSpider # configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) # runner = CrawlerRunner(get_project_settings()) # # d = runner.crawl(TopSpider) # d.addBoth(lambda _: reactor.stop()) # reactor.run() configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) runner = CrawlerRunner(get_project_settings()) d = runner.crawl(BaiduSpider) d.addBoth(lambda _: reactor.stop()) reactor.run() # configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) # runner = CrawlerRunner(get_project_settings()) # d = runner.crawl(RedisSpider) # d.addBoth(lambda _: reactor.stop()) # reactor.run()
# -*-coding:utf-8-*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
import multiprocessing
from get_url import get_url
import time

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print("get once!!")
    get_url()
    print("update url!!" + ' at ' + time.strftime("%Y-%m-%d %H:%M", time.localtime()))
def search(self): print(self.user) print(__name__) #print(user, '3') #process = CrawlerProcess(get_project_settings()) #process.crawl('JumpReport') #process.start() #process.stop() #process.put() # 脚本执行爬虫代码 runner = CrawlerRunner(get_project_settings()) #def search(runner, keyword): # return runner.crawl(JumpReport, keyword) #runner = CrawlerProcess() #dfs = set() print('a') runner.crawl('JumpReport', user=self.user) print(self.user) d = runner.join() #dfs.add(d) #defer.DeferredList(dfs).addBoth(lambda _: reactor.stop()) d.addBoth(lambda _: reactor.stop()) #search(runner, "abcd") #search(runner, "beat") #runner.start() reactor.run() # 阻塞运行爬虫 print("complete") # runner = CrawlerRunner(get_project_settings()) # dfs = set() # for domain in range(2): # d = runner.crawl('JumpReport') # dfs.add(d) # # defer.DeferredList(dfs).addBoth(lambda _: reactor.stop()) # reactor.run() # the script will block here until all crawling jobs are finished # runner = CrawlerRunner(get_project_settings()) # # @defer.inlineCallbacks # def crawl(): # for domain in range(2): # yield runner.crawl('JumpReport') # reactor.stop() # # crawl() # reactor.run() # the script will block here until the last crawl call is finished # settings = Settings({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}) # runner = CrawlerRunner(settings) # # d = runner.crawl(JumpReport) # d.addBoth(lambda _: reactor.stop()) # reactor.run() # the script will block here until the crawling is finished # runner = CrawlerProcess(get_project_settings()) # runner.crawl(JumpReport) # runner.start() name = self.qle.text() db = db_handle() with db as con: sql = "select * from player where name = '{}' order by update_time".format( name) con.execute(sql) player = con.fetchone() if player: id, name, win, match_count, strength, level, update_time, rank = player text = "角色名: {}\n胜场: {}\n总场数: {}\n团分: {}\n团分排行: {}\n等级: {}\n更新时间: {}".format( name, win, match_count, strength, rank, level, update_time) self.txt.setText(text) sql = "select * from player_data where name = '{}' order by date".format( name) con.execute(sql) player_data = con.fetchall() a = "" for data in player_data: a += str(data) a += "\n" self.battle.setText(str(a)) sql = "select * from game_data order by match_id desc" con.execute(sql) game_data = con.fetchall() a = "" l = 0 self.battle_table.setRowCount(len(game_data)) for data in game_data: a += str(data[1:]) print(type(data)) for i in range(self.battle_table.columnCount()): item = QTableWidgetItem(str(data[i + 1])) # 设置填入数据的排列位置(左右居中| 上下居中) item.setTextAlignment(Qt.AlignHCenter | Qt.AlignVCenter) self.battle_table.setItem(l, i, item) a += "\n" self.player_status.setText(str(a)) l += 1
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)
        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
from SWSpider import *
from SWSpider.spiders.sw_spider import *
import SWSpider.settings

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

east_test = ['DTW', 'BOS', 'DCA', 'FLL', 'MDW']

runner = CrawlerRunner(get_project_settings())
dfs = set()

# Truncate the output file before the crawls append to it.
f = open('east_prices', 'w')
f.write('')
f.close()

# Schedule one crawl per ordered city pair, skipping same-city pairs.
for x in range(len(east_test)):
    for y in range(len(east_test)):
        if x == y:
            continue
        d = runner.crawl('sw_spider',
                         depCity=east_test[x],
                         arrCity=east_test[y],
                         x=x, y=y,
                         filename='east_prices')
        dfs.add(d)

# Stop the reactor once every scheduled crawl has finished.
defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
def run_spider(self):
    """
    Rather than using scrapyd or executing the spider manually via scrapy,
    this method creates a CrawlerRunner and runs the spider provided at
    construction.

    https://doc.scrapy.org/en/latest/topics/practices.html#run-from-script
    http://twistedmatrix.com/trac/wiki/FrequentlyAskedQuestions#Igetexceptions.ValueError:signalonlyworksinmainthreadwhenItrytorunmyTwistedprogramWhatswrong
    """
    try:
        self.set_stage(ScraperStage.CRAWLING)
        self.start_time = time.time()

        # post debug message to slack
        if self.debug_slack:
            self.handle_slack_message(DEBUG_SLACK_CHANNEL,
                                      'Starting scraper ' + self.name)

        runner = CrawlerRunner({
            'USER_AGENT': get_random_user_agent.get_random_user_agent(),
            'FEED_FORMAT': 'json',
            'FEED_URI': self.new_file_name,
            'AUTOTHROTTLE_ENABLED': 'True',
            'DUPEFILTER_DEBUG': 'True'
        })
        #runner.signals.connect(self.handle_spider_close, signals.spider_closed)

        # todo: deferred spider or something like
        # https://kirankoduru.github.io/python/multiple-scrapy-spiders.html
        _d = runner.crawl(self.spider)
        # stop the reactor when we're done
        _d.addBoth(lambda _: reactor.stop())
        # http://twistedmatrix.com/documents/9.0.0/core/howto/deferredindepth.html#auto7
        # https://twistedmatrix.com/documents/17.9.0/api/twisted.internet.defer.Deferred.html

        signal.signal(signal.SIGINT, self.terminate)
        signal.signal(signal.SIGTERM, self.terminate)

        # crawler = Crawler(self.spider, {
        #     'USER_AGENT': get_random_user_agent.get_random_user_agent(),
        #     'FEED_FORMAT': 'json',
        #     'FEED_URI': self.new_file_name,
        #     'AUTOTHROTTLE_ENABLED': 'True',
        #     'DUPEFILTER_DEBUG': 'True'
        # })
        # crawler.signals.connect(self.handle_spider_close, signal=signals.spider_closed)
        # deferred = crawler.crawl()
        # deferred.addBoth(lambda _: self.handle_spider_done)

        reactor.run()
        return True, None
    except KeyboardInterrupt:
        raise KeyboardInterrupt("KeyboardInterrupt caught in run")
    except Exception as _e:
        exc_type, exc_value, exec_tb = sys.exc_info()
        return False, 'Caught ' + str("".join(
            traceback.format_exception(exc_type, exc_value, exec_tb)))
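# --- Sketch (not in the original class): calling run_spider() off the main thread ---
# The Twisted FAQ linked in the docstring covers the "signal only works in main
# thread" error. If run_spider() were ever invoked from a worker thread (a job
# queue, for example), both signal.signal() and the reactor's default signal
# handling would have to be skipped; this helper is an assumption about how
# that could look, reusing self.terminate from the method above.
import signal
import threading

from twisted.internet import reactor


def _run_reactor(self):
    in_main_thread = threading.current_thread() is threading.main_thread()
    if in_main_thread:
        signal.signal(signal.SIGINT, self.terminate)
        signal.signal(signal.SIGTERM, self.terminate)
    # installSignalHandlers=False keeps Twisted from installing its own
    # SIGINT/SIGTERM handlers, which is only allowed in the main thread anyway.
    reactor.run(installSignalHandlers=in_main_thread)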