Example #1
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
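Like most snippets on this page, the example above omits its imports. Below is a minimal, self-contained sketch of the same CrawlerRunner pattern (schedule several spiders, join their deferreds, stop the reactor); the spider names 'spider_a' and 'spider_b' are placeholders, not spiders from the example.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def run_all():
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    # Schedule each spider; crawl() returns a Deferred per spider.
    for name in ('spider_a', 'spider_b'):  # placeholder spider names
        runner.crawl(name)
    d = runner.join()                       # fires when all crawls finish
    d.addBoth(lambda _: reactor.stop())     # stop the reactor on success or failure
    reactor.run()                           # blocks until the reactor stops


if __name__ == '__main__':
    run_all()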
Example #2
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    # runner = CrawlerRunner(settings)
    # runner.crawl(BookToscrapeSpider)

    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    # CrawlerRunner.crawl() expects a spider class (or Crawler), not an instance
    runner.crawl(BookToscrapeSpider_crawl)

    # Older Crawler/log API, kept commented for reference:
    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    # CrawlerRunner.crawl() expects the spider class itself, not an instance
    spider = EntertainmentcareersSpider

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # Older Crawler/log API, kept commented for reference:
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example #4
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example #5
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #8
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
Example #9
    def test_crawler_runner_bootstrap_failed_for_several(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        yield runner.crawl(NoRequestsSpider)

        self.assertEqual(runner.bootstrap_failed, True)
Example #10
def startprocess(queue):
	runner = CrawlerRunner(get_project_settings())
	dfs = set()

	# argument 1 in the callback identifies linkspider
	l = runner.crawl('linkspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(l)

	# argument 2 in the callback identifies srcspider
	s = runner.crawl('srcspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(s)

	# argument 3 in the callback identifies codespider
	c = runner.crawl('codespider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(c)

	# the script will block here until all crawling jobs are finished
	defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
	reactor.run()
Example #11
    def handle_lj(self):
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        runner = CrawlerRunner(crawler_setting)
        #d = runner.crawl(HouseSpider)
        d = runner.crawl(LianjiaHouseSpider)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Example #12
    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEqual(message_count, 1)
Example #13
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    

    # settings.set('FEED_FORMAT','json')
    # settings.set('FEED_URI', 'result.json')

    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run() # the script will block here until the crawling is finished

    print(result)
Example #14
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Example #15
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #16
    def test_same_url(self):

        class TestSameUrlSpider(Spider):
            name = 'test_same_url'

            def __init__(self, *args, **kwargs):
                super(TestSameUrlSpider, self).__init__(*args, **kwargs)
                self.visited = 0

            def start_requests(s):
                return self.conman.from_spider(s, self.results)

            def parse_first(self, response):
                self.visited += 1
                return TestItem()

            def parse_second(self, response):
                self.visited += 1
                return TestItem()

        with MockServer() as mockserver:
            contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))

            get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
            get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

            crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
            yield crawler.crawl()

        self.assertEqual(crawler.spider.visited, 2)
    def crawl(self):
        # CrawlerRunner.crawl() expects the spider class, not an instance
        spider = Scrapy_ModuleSpider
        Runner = CrawlerRunner(self.Scrapy_Module_setting)
        cra = Runner.crawl(spider)
        # stop reactor when spider closes
        cra.addBoth(lambda _: self.spider_closing(cra))
        self.logger.info("Run reactor")
        reactor.run()
Example #18
def webcrawl(queue, webs, dom):
	website = ''
	domain = ''
	try:
		runner = CrawlerRunner(get_project_settings())
		dfs = set()
		# argument 1 in the callback identifies linkspider
		l = runner.crawl('linkspider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		dfs.add(l)
		# argument 2 in the callback identifies srcspider
		s = runner.crawl('srcspider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		dfs.add(s)
		# argument 3 in the callback identifies codespider
		c = runner.crawl('codespider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		dfs.add(c)
		# the script will block here until all crawling jobs are finished
		defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
		reactor.run()
	except Exception as e:
		print(e)
Example #19
    def runSpider(self, spider):
        configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
        settings = Settings()
        settings.set('FEED_URI', 'output.json')
        settings.set('FEED_FORMAT', 'json')

        runner = CrawlerRunner(settings)
        dfd = runner.crawl(spider)
        dfd.addBoth(lambda _: reactor.stop())
Example #20
class Runner(object):
    def __init__(self,*args,**kwargs): 
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings) 

    def add(self,*a,**kw):  
        crawler = Crawler(BroadSpider,self.settings) 
        self.runner.crawl(crawler,*a,**kw)

    def start(self): 
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop()) 
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
Example #21
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)
Example #22
def main():
    locale.setlocale(locale.LC_TIME, 'es_ES')

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()

    d = runner.crawl(LotoSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    return None
Example #23
def run_login_spider(seed_url, username, password, db_name, logfile = "results.log"):

    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url = seed_url, username = username, password = password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level = log.INFO)
    reactor.run()
Example #24
    def crawl(self):
        os.environ['SCRAPY_PROJECT'] = '{0}/{1}'.format(BASE_DIR, 'collector')
        runner = CrawlerRunner({'LOG_LEVEL': 'WARNING',
                                'LOG_FORMATTER': 'collector.collector.utils.PoliteLogFormatter',
                                'ITEM_PIPELINES': {
                                    'collector.collector.pipelines.CodingDuplicatesPipeline': 1,
                                    'collector.collector.pipelines.CodingPriorityPipeline': 2
                                }})
        # runner = CrawlerRunner()
        d = runner.crawl(CodingProjectSpider)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Example #25
    def run_and_export(self, spider_cls, settings=None):
        """ Run spider with specified settings; return exported data. """
        tmpdir = tempfile.mkdtemp()
        res_name = tmpdir + "/res"
        defaults = {"FEED_URI": "file://" + res_name, "FEED_FORMAT": "csv"}
        defaults.update(settings or {})
        try:
            with MockServer() as s:
                runner = CrawlerRunner(Settings(defaults))
                yield runner.crawl(spider_cls)

            with open(res_name, "rb") as f:
                defer.returnValue(f.read())

        finally:
            shutil.rmtree(tmpdir)
Example #26
def perform_scrape():
    '''Perform a MunchSpider scrape using the current Scrapy Settings
    '''
    settings = scrapingtools.get_settings()
    publisher_database = get_publisher_database(settings, mongo=False)
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    (doi_links, doi_sources) = get_joblist(settings.get('COLLECT_FILE_NAME'))
    domains = get_domains(publisher_database)
    runner = CrawlerRunner(settings)
    d = runner.crawl(Spiders.MunchSpider.MunchSpider,
                     start_urls=doi_links,
                     crossref_items=doi_sources,
                     allowed_domains=domains,
                     publisher_database=publisher_database,
                     )
    d2 = d.addBoth(lambda _: reactor.stop())
    d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('COMPLETE_FILE_NAME')))
    d2.addCallback(lambda _: scrapingtools.finalise_file(settings.get('ERROR_FILE_NAME')))
Example #27
def _run_feed_spider(url, feed):
    spid = str(uuid.uuid4())
    feed['_id'] = spid
    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    save_feed(url)
    cls = SpiderFactory.create_spider(feed)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)
    n = get_stats([spid])[spid]
    if n == 0:
        raise Exception(f'feed spider crawled 0 articles')
    if is_exists_spider(url):
        raise Exception(f'feed[{url}] existed')
    del feed['_id']
    save_spider_settings(feed)
def run_spiders():
    """
    Note:
        If this caller is the outermost loop of the program, the spider's project
        settings can be loaded directly here, using the following code in the file:
        from scrapy.utils.project import get_project_settings
        # some code
        runner = CrawlerRunner(get_project_settings())

        If this call is only a wrapped function, the settings have to be built by
        hand, as in the code below.
    """
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})  # define the log format
    # Build the settings for this spider; here we choose which pipeline to use
    settings = Settings()
    settings.set('ITEM_PIPELINES', {'spider.tutorial.pipelines.TutorialPipeline': 300,})
    # Hand the assembled settings to the crawler runner
    runner = CrawlerRunner(settings)

    d = runner.crawl(ChinazSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
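For comparison, a minimal sketch of the "outermost script" variant mentioned in the docstring above, where the project's own settings are loaded with get_project_settings() and the pipeline override is applied on top of them; the function name and the import path for ChinazSpider are placeholders, not taken from the example.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from spider.tutorial.spiders.chinaz import ChinazSpider  # hypothetical import path


def run_spiders_from_project():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()  # read the project's settings.py
    # a per-run override still works on top of the project settings
    settings.set('ITEM_PIPELINES', {'spider.tutorial.pipelines.TutorialPipeline': 300})
    runner = CrawlerRunner(settings)
    d = runner.crawl(ChinazSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()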
Example #29
    def run_and_export(self, spider_cls, settings=None):
        """ Run spider with specified settings; return exported data. """
        tmpdir = tempfile.mkdtemp()
        res_path = os.path.join(tmpdir, 'res')
        res_uri = urljoin('file:', pathname2url(res_path))
        defaults = {
            'FEED_URI': res_uri,
            'FEED_FORMAT': 'csv',
        }
        defaults.update(settings or {})
        try:
            with MockServer() as s:
                runner = CrawlerRunner(Settings(defaults))
                spider_cls.start_urls = [s.url('/')]
                yield runner.crawl(spider_cls)

            with open(res_path, 'rb') as f:
                content = f.read()

        finally:
            shutil.rmtree(tmpdir, ignore_errors=True)

        defer.returnValue(content)
Example #30
# coding: utf-8
from __future__ import unicode_literals
from __future__ import absolute_import

import sys
import os
import logging
from settings import BASE_DIR, USER_AGENTS

# PRO_PATH = '{0}/{1}'.format(BASE_DIR, 'shadow')
sys.path.append(BASE_DIR)
os.environ['SCRAPY_PROJECT'] = BASE_DIR

from Shadow.spiders.zhihu_spider import ZHPeopleColumnSpider
from scrapy.conf import settings
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

settings.overrides.update({'USER_AGENTS': USER_AGENTS})
count = 1
while 1:
    logging.info('start crawl people column by {0} times'.format(count))
    process = CrawlerRunner(settings)
    d = process.crawl(ZHPeopleColumnSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #31
        newFp.close()
        fp.close()

    def transfer(self,a):
        b=''
        for i in a.split('\\'):
            if(len(i) == 0):
                continue
            if(i[0] == 't'):
                b += '\t'
                if(len(i)>1):
                    b += i[1:]
                continue
            if(i[0] =='n'):
                b += '\n'
                if(len(i)>1):
                    b += i[1:]
                continue
            if(i[0] == 'r'):
                b += '\r'
                if(len(i)>1):
                    b += i[1:]
                continue
            b += i
        return b

runner = CrawlerRunner()
d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())
Example #32
def scrape_with_crochet(spider):
    global data
    data = []
    crawl_runner = CrawlerRunner()
    dispatcher.connect(process_result, signal=item_scraped)
    return crawl_runner.crawl(spider)
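Example #32 relies on crochet to drive the Twisted reactor from a blocking caller (e.g. a Flask view), but the snippet omits the crochet setup and decorator as well as the imports. A minimal sketch of that wiring is shown below; QuotesSpider is a placeholder for a real spider class, and the 60-second timeout is an arbitrary assumption.

from crochet import setup, wait_for
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.signalmanager import dispatcher

setup()  # crochet starts the Twisted reactor in a background thread

data = []


def process_result(item, response, spider):
    # called for every item_scraped signal; only the declared arguments are passed
    data.append(dict(item))


dispatcher.connect(process_result, signal=signals.item_scraped)


@wait_for(timeout=60.0)  # block the synchronous caller until the crawl Deferred fires
def scrape_with_crochet(spider_cls):
    runner = CrawlerRunner()
    return runner.crawl(spider_cls)  # Deferred that wait_for waits on

# usage, with QuotesSpider standing in for a real spider class:
# scrape_with_crochet(QuotesSpider)
# print(data)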
Example #33
    def handle(self, *args, **options):
        configure_logging()
        process = CrawlerRunner(get_project_settings())

        # film crawlers
        process.crawl('acfilm')
        process.crawl('fpp')
        process.crawl('retrospekt')
        process.crawl('brooklyn-film')
        process.crawl('precision-film')
        process.crawl('bhfilm')
        process.crawl('freestyle')
        process.crawl('moment')
        process.crawl('ultrafine')

        # camera crawlers
        process.crawl('brooklyn')
        process.crawl('austin_camera')
        process.crawl('precision')
        process.crawl('keh')
        process.crawl('bh')
        # not super impressed with etsy, tbh
        # process.crawl('etsy')

        d = process.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Example #34
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
Topic: sample
Desc :
"""

import logging
from spiders.user_relationship_nets import UserRelationshipNetsSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)

    runner.crawl(UserRelationshipNetsSpider, 'xrcy168', runner)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()
    logging.info('all finished.')
Example #35
import logging

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from mengEventProject.models import db_connect
from mengEventProject.models import ArticleRule
from sqlalchemy.orm import sessionmaker
from mengEventProject.spiders.articleSpider import ArticleSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)

    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # blocks process so always keep as the last statement
    reactor.run()
    logging.info('all finished.')
Example #36
from twisted.internet import reactor
import scrapy

from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from jobs.spiders.bulldogjob import BulldogjobSpider


settings = get_project_settings()
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(settings)

d = runner.crawl(BulldogjobSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()
Example #37
# -*- coding: utf-8 -*-

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

from cnblogSpider.spiders.cnblogs import CnblogsSpider
from cnblogSpider.spiders.douban import DoubanSpider
from cnblogSpider.spiders.zufang import ZufangSpider
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(ZufangSpider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #38
def prelaunch(user, **kwargs):
    if ('json_parser' in kwargs) and kwargs['json_parser']:
        json_parser = kwargs['json_parser']
    else:
        json_parser = False

    if ('csv_parser' in kwargs) and kwargs['csv_parser']:
        csv_parser = kwargs['csv_parser']
    else:
        csv_parser = False

    if ('start_time' in kwargs) and kwargs['start_time']:
        start_time = kwargs['start_time']
    else:
        start_time = ''

    settings = get_project_settings()
    if (json_parser == True) and (csv_parser == True):
        settings.set(
            "FEEDS", {
                "%s_posts.json" % user: {
                    'format': 'json',
                    'encoding': 'utf8'
                },
                pathlib.Path('%s_posts.csv' % user): {
                    'format': 'csv'
                },
            })
    elif ((json_parser == True) and
          (csv_parser == False)) or ((json_parser == False) and
                                     (csv_parser == False)):
        settings.set("FEEDS", {
            "%s_posts.json" % user: {
                "format": "json",
                'encoding': 'utf8'
            },
        })
    elif (csv_parser == True) and (json_parser == False):
        settings.set("FEEDS", {
            pathlib.Path('%s_posts.csv' % user): {
                'format': 'csv'
            },
        })

    # configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s' })
    # logging.getLogger('scrapy').propagate = False
    runner = CrawlerRunner(settings)

    d = runner.crawl(ProfilSpider, profil=user, time=start_time)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished

    if (json_parser == False) and (csv_parser == False):
        try:
            with open("%s_posts.json" % user, encoding='UTF-8') as f:
                data = json.load(f)

            # os.remove("%s_posts.json" % user)

            return data
        except:
            return 'cannot parse data'
Example #39
                dont_filter=True
            )
    
    def parse(self, response):
        # load IP which made the request from json string
        ip = loads(response.text)['origin']
        yield {'ip': ip}

def get_settings() -> Settings:
    settings = Settings()
    # Enter your package credentials here!
    settings.set('PROXYLAND', {
        'username': '******',
        'password': '******'
    })
    # enable ProxylandMiddleware and HttpProxyMiddleware
    settings.set('DOWNLOADER_MIDDLEWARES', {
        'middleware.ProxylandMiddleware': 350,
        'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
    })
    return settings

if __name__ == '__main__':
    # routine to run scrapy from a script
    # see: https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    settings = get_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(IpSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #40
class ScrAPI(Flask):
    def __init__(self, import_name=__package__, **kwargs):
        super(ScrAPI, self).__init__(import_name, **kwargs)
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        self._init_url_rules()
        self.process = CrawlerRunner(get_project_settings())
        self.tp = reactor.getThreadPool()
        self.database = DatabaseConnector(DATABASE_URL)
        self.response_meta = {"meta": {"project": "WSF Web Scraper"}}

    def __del__(self):
        self.database._close_all_spiders()
        self.database.cursor.close()
        self.database.connection.close()

    def run(self, host=None, port=None, debug=None, **options):
        super(ScrAPI, self).run(host, port, debug, **options)

    def _get_meta_response(self, res):
        res.update(self.response_meta)
        return res

    def _init_url_rules(self):
        """Attach the endpoints to run spiders and list the spiders
        that are available in the API
        """

        self.add_url_rule(
            '/spiders',
            view_func=self.list_spiders,
            methods=['GET'],
        )
        self.add_url_rule(
            '/spiders',
            view_func=self.run_spider,
            methods=['POST'],
        )
        self.add_url_rule(
            '/spiders/<int:spider_id>',
            view_func=self.close_spider,
            methods=['DELETE'],
        )
        self.add_url_rule(
            '/database',
            view_func=self.import_db,
            methods=['POST'],
        )
        self.add_url_rule(
            '/database',
            view_func=self.export_db,
            methods=['GET'],
        )
        self.add_url_rule(
            '/database',
            view_func=self.clear_scraps,
            methods=['DELETE'],
        )
        self.add_url_rule(
            '/crawls',
            view_func=self.list_crawls,
            methods=['GET'],
        )
        self.add_url_rule(
            '/crawls',
            view_func=self.stop,
            methods=['DELETE'],
        )
        self.add_url_rule(
            '/',
            view_func=self.home,
            methods=['GET'],
        )

    def home(self):
        routes = [{
            "url": "/spiders",
            "method": "GET"
        }, {
            "url": "/spiders",
            "method": "POST",
            "arguments": {
                "spider": "name of the spider to run"
            }
        }, {
            "url": "/spiders/:spider_id",
            "method": "DELETE",
            "arguments": {
                "spider_id": "uuid of the spider to close"
            }
        }, {
            "url": "/crawls",
            "method": "GET"
        }, {
            "url": "/crawls",
            "method": "DELETE"
        }, {
            "url": "/database",
            "method": "GET"
        }, {
            "url": "/database",
            "method": "POST",
            "arguments": {
                "file": "json file containing the database dump"
            }
        }, {
            "url": "/database",
            "method": "DELETE"
        }]
        result = self._get_meta_response({"routes": routes})
        return jsonify(result), 200

    def list_spiders(self):
        spiders = self.process.spider_loader.list()
        return jsonify({"spiders": spiders, "status": "success"}), 200

    def run_spider(self):
        post_data = request.get_json()
        spider = post_data.get('spider')
        if spider == 'who_iris':
            spider = who_iris_spider.WhoIrisSpider()
        elif spider == 'nice':
            spider = nice_spider.NiceSpider()
        else:
            return '', 404
        spider_id = str(uuid.uuid4())
        self.process.crawl(spider, uuid=spider_id)
        crawl = self.process.join()
        self.database.insert_spider(spider.name, spider_id)
        crawl.addBoth(self.on_success)
        return jsonify({
            "data": {
                "status": "running",
                "spider": spider.name,
                "_id": spider_id
            }
        }), 200

    def on_success(self, data):
        self.database._close_all_spiders()

    def close_spider(self, spider_id):
        for crawl in self.process.crawlers:
            if crawl.spider.uuid == spider_id:
                crawl.stop()
                return jsonify(
                    {"data": {
                        "status": "success",
                        "_id": spider_id
                    }}), 200
        return '', 400

    def list_crawls(self):
        crawls = self.process.crawlers
        running_spiders = []
        for crawl in crawls:
            start_time = crawl.stats.get_value('start_time')
            spider = {
                '_id': crawl.spider.uuid,
                'spider': crawl.spider.name,
                'start_time': start_time,
                'total_time': str(datetime.now() - start_time),
                'item_dropped': crawl.stats.get_value('item_dropped_count'),
                'item_scraped': crawl.stats.get_value('item_scraped_count'),
                'total_requests': crawl.stats.get_value('downloader/request_count'),
            }

            running_spiders.append(spider)
        finished_spiders = []
        for spider in self.database.get_finished_crawls():
            finished_spiders.append(spider)
        spiders = {"crawling": running_spiders, "finished": finished_spiders}
        return jsonify({"data": {"spiders": spiders}}), 200

    def stop(self):
        self.process.stop()
        return jsonify({"data": {"status": "success"}}), 200

    def export_db(self):
        articles_rows = self.database.get_articles()
        articles = []
        now = datetime.now()
        for title, file_hash, url in articles_rows:
            articles.append({
                'title': title,
                'file_hash': file_hash,
                'url': url,
            })
        json_file = tempfile.NamedTemporaryFile()
        json_file.write(json.dumps(articles).encode('utf-8'))
        json_file.seek(0)
        return send_file(json_file,
                         mimetype='application/json',
                         as_attachment=True,
                         attachment_filename=f'export-{now}.json')

    def import_db(self):
        if request.files:
            data_file = request.files.get('file')
            if data_file.filename == '':
                return 'Filename must not be blank', 400
            if data_file.content_type == 'application/json':
                json_file = data_file.stream.read()
            else:
                return 'File format is not json.', 415

            try:
                json_dict = json.loads(json_file)
                for article in json_dict:
                    self.database.insert_article(article.get('title'),
                                                 article.get('file_hash'),
                                                 article.get('url'))

                return '', 201
            except Exception as e:
                result = {"errors": [str(e)]}
                return jsonify(result), 400
        else:
            return 'No JSON file in request', 400

    def clear_scraps(self):
        try:
            self.database.reset_scraped()
            return '', 204
        except Exception as e:
            return str(e), 500
Example #41
    def Crawl_job(self, URL, Next):
        Runner = CrawlerRunner(settings=self.spider_settings)
        return Runner.crawl(eval(self.SpiderName), profile_url=URL, next=Next)
Example #42
import sys

from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor
from scrapy.utils.project import get_project_settings
from weibo_scrapy.spiders.weibo import WeiboSpider
from weibo_scrapy.spiders.weibo_comment import CommentSpider
from weibo_scrapy.spiders.weibo_repost import RepostSpider
from scrapy.utils.log import configure_logging

# Pass two arguments, {type} and {line}, e.g.: python crawler_run.py weibo 1,重庆发布,1988438334,20,False@_@
# type is one of: weibo | repost | comment
if __name__ == '__main__':
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())

    if sys.argv[1] == 'weibo':
        d = runner.crawl(WeiboSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())
    elif sys.argv[1] == 'repost':
        d = runner.crawl(RepostSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())
    elif sys.argv[1] == 'comment':
        d = runner.crawl(CommentSpider, sys.argv[2])
        d.addBoth(lambda _: reactor.stop())

    reactor.run()

# Search by user id: field 1: type = 1, field 2: user name, field 3: user id, field 4: number of pages, field 5: whether to fetch images
# Example:   1,重庆发布,1988438334,3,True
# execute(("scrapy crawl weibo -a line=1,重庆发布,1988438334,20,False@_@").split(" "))

# Search by keyword: field 1: type = 2, field 2: keyword, field 3: 1 - general, 60 - hot, 61 - real-time, field 4: number of pages, field 5: whether to fetch images
Example #43
            '//*[@id="ficha_producto_int"]/h1/text()').extract_first()
        ml_item['precio'] = response.xpath(
            '//*[@id="PriceProduct"]/text()[not(parent::span[@class="SignoPriceProduct"])and normalize-space()]'
        ).extract()
        ml_item['link'] = response.xpath(
            '//*[@id="HeaderInfoMiddlePerfil_Box_2"]/a/@href').extract()
        ml_item['sku'] = response.xpath(
            '//*[@id="imagen_producto"]/div[@class="dvInfoGral"][1]/span[@class="txValueInfoGral"]/text()'
        ).extract()
        ml_item['plataforma'] = response.xpath(
            '//*[@id="imagen_producto"]/div[@class="dvInfoGral"][2]/span[@class="txValueInfoGral"]/text()'
        ).extract()

        self.item_count += 1
        if self.item_count > 100:
            raise CloseSpider('item_exceeded')
        yield ml_item


configure_logging()
runner = CrawlerRunner()
runner.crawl(SVGSpiderPS4)
runner.crawl(SVGSpiderPS3)
runner.crawl(SVGSpiderXONE)
runner.crawl(SVGSpiderPSVITA)
runner.crawl(SVGSpiderNSWI)
runner.crawl(SVGSpiderWIIU)
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited),
                         11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_fixed_delay(self):
        yield self._test_delay(total=3, delay=0.2)

    @defer.inlineCallbacks
    def test_randomized_delay(self):
        yield self._test_delay(total=3, delay=0.1, randomize=True)

    @defer.inlineCallbacks
    def _test_delay(self, total, delay, randomize=False):
        crawl_kwargs = dict(
            maxlatency=delay * 2,
            mockserver=self.mockserver,
            total=total,
        )
        tolerance = (1 - (0.6 if randomize else 0.2))

        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertTrue(average > delay * tolerance,
                        "download delay too small: %s" % average)

        # Ensure that the same test parameters would cause a failure if no
        # download delay is set. Otherwise, it means we are using a combination
        # of ``total`` and ``delay`` values that are too small for the test
        # code above to have any meaning.
        settings["DOWNLOAD_DELAY"] = 0
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertFalse(average > delay / tolerance,
                         "test total or delay values are too small")

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({
            "DOWNLOAD_TIMEOUT": 0.35
        }).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1,
                                mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True,
                            distinct_urls=2,
                            dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False,
                            distinct_urls=3,
                            dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from urllib.parse import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'),
                       dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(
            crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(
            crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'),
                         ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'),
                            callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver),
                                 TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler,
                              self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver), ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler,
                                    self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self.assertIn("[callback] status 200", str(log))
        self.assertIn("[callback] status 201", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        self.runner.crawl(AsyncDefSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        runner = CrawlerRunner({
            "TWISTED_REACTOR":
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        })
        runner.crawl(AsyncDefAsyncioSpider,
                     self.mockserver.url("/status?n=200"),
                     mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_list(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse(self):
        from tests.py36._test_crawl import AsyncDefAsyncioGenSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 1)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_loop(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        from tests.py36._test_crawl import AsyncDefAsyncioGenLoopSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenLoopSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 10)
        for i in range(10):
            self.assertIn({'foo': i}, items)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_complex(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        from tests.py36._test_crawl import AsyncDefAsyncioGenComplexSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenComplexSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        yield crawler.crawl(mockserver=self.mockserver)
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 156)
        # some random items
        for i in [1, 4, 21, 22, 207, 311]:
            self.assertIn({'index': i}, items)
        for i in [10, 30, 122]:
            self.assertIn({'index2': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_reqs_list(self):
        crawler = self.runner.create_crawler(AsyncDefAsyncioReqsReturnSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        for req_id in range(3):
            self.assertIn("Got response 200, req_id %d" % req_id, str(log))

    @defer.inlineCallbacks
    def test_response_ssl_certificate_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=False)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta['responses'][0].certificate)

    @defer.inlineCallbacks
    def test_response_ssl_certificate(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @mark.xfail(
        reason="Responses with no body return early and contain no certificate"
    )
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")
Example #46
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from sina.spiders import weibo_spider

configure_logging()
runner = CrawlerRunner()
runner.crawl(weibo_spider)  # crawl() expects a Spider subclass (or spider name), so weibo_spider must resolve to one
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
def run_spider(settings, spider):
    runner = CrawlerRunner(settings)
    deferred = runner.crawl(spider)
    return deferred
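
A minimal sketch of driving the run_spider helper above from a script; ExampleSpider is a placeholder spider added only to make the sketch self-contained:

from twisted.internet import reactor
import scrapy
from scrapy.utils.project import get_project_settings

class ExampleSpider(scrapy.Spider):
    # placeholder spider, for illustration only
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

if __name__ == '__main__':
    d = run_spider(get_project_settings(), ExampleSpider)
    d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl finishes
    reactor.run()                        # blocks until then
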
Example #48
class ScheduleCrawlerRunner:
    @staticmethod
    def interval_to_app_task(interval='daily'):
        now = datetime.fromtimestamp(
            os.getenv('NOW', time.time()),
            pytz.timezone(os.getenv('APP_TIMEZONE', 'Asia/Chongqing')),
        )
        # %Y-%m-%d %H:%M:%S %z
        # applogger.debug('Interval %s convert to APP_TASK at %s', interval, now)
        formats = {
            'debug': lambda: now.strftime('%Y%m%d_%H%M'),
            'hourly': lambda: now.strftime('%Y%m%d_%H'),
            'daily': lambda: now.strftime('%Y%m%d'),
            'weekly': lambda: now.strftime('%YW%U'),
            'monthly': lambda: now.strftime('%Y%m'),
        }
        return formats[interval]()

    def __init__(self, spider_name: str):
        self.settings = get_project_settings()
        self.crawler = CrawlerRunner(self.settings)
        self.round = 0
        self.spider_name = spider_name

    def get_spider_class(self, spider_name: str):
        spider_module = importlib.import_module('evascrapy.spiders.' +
                                                spider_name + '_spider')
        spider_class = None
        for name, spider_member in inspect.getmembers(spider_module):
            if inspect.isclass(spider_member) \
                    and issubclass(spider_member, CrawlSpider) \
                    and hasattr(spider_member, 'name') \
                    and spider_member.name:
                spider_class = spider_member
                break
        return spider_class

    def schedule(self):
        scheduler = TwistedScheduler(
            {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})

        # TODO: use random interval
        switch = {
            'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
            'hourly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3600),
            'daily': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400),
            'weekly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400 * 7),
            'monthly': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=86400 * 30),
        }

        switch[self.settings.get('APP_CRAWL_INTERVAL')]()
        scheduler.start()

    def run_crawler(self):
        spider_class = self.get_spider_class(self.spider_name)

        if os.getenv('APP_DISTRIBUTED'):
            redis = get_redis(url=self.crawler.settings.get('REDIS_URL'))

        if len(list(self.crawler.crawlers)) < 1:
            self.crawler.settings.set(
                'APP_TASK',
                ScheduleCrawlerRunner.interval_to_app_task(
                    self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL')))

            if os.getenv('APP_DISTRIBUTED'):
                if redis.zcount(spider_class.name + ':requests', 0, 100) < 1:
                    for start_url in spider_class.start_urls:
                        redis.sadd(spider_class.name + ':start_urls',
                                   start_url)
                else:
                    self.crawler.settings.set(
                        'APP_TASK',
                        redis.get(spider_class.name +
                                  ':app_task').decode('utf-8'))

            logger.info(
                '[SPIDER.%s.%s.DIS_%s.ROUND_%s] started, APP_CRAWL_INTERVAL: %s, APP_STORAGE_SHUFFLE_INTERVAL: %s',
                spider_class.name, self.crawler.settings.get('APP_TASK'),
                os.getenv('APP_DISTRIBUTED'), self.round,
                self.crawler.settings.get('APP_CRAWL_INTERVAL'),
                self.crawler.settings.get('APP_STORAGE_SHUFFLE_INTERVAL'))
            self.crawler.crawl(spider_class)
            if os.getenv('APP_DISTRIBUTED'):
                redis.set(spider_class.name + ':app_task',
                          self.crawler.settings.get('APP_TASK'))
            self.round += 1
        else:
            logger.info('NEW ROUND SKIPPED BY [SPIDER.%s.%s.DIS_%s.ROUND_%s]',
                        spider_class.name,
                        self.crawler.settings.get('APP_TASK'),
                        os.getenv('APP_DISTRIBUTED'), self.round)

    def start(self):
        reactor.run()
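
A minimal sketch of how the scheduler above might be started from a script; the spider name 'example' is a placeholder that would resolve to evascrapy/spiders/example_spider.py:

if __name__ == '__main__':
    schedule_runner = ScheduleCrawlerRunner('example')  # placeholder spider name
    schedule_runner.schedule()  # registers the periodic job according to APP_CRAWL_INTERVAL
    schedule_runner.start()     # reactor.run() blocks here until the process is stopped
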
Example #49
    # get current working directory
    cwd = str(pathlib.Path().absolute())
    # set path in which to store images
    settings.set('IMAGES_STORE', cwd + '\\')
    settings.set(
        'IMAGE_URL_FIELDS', {
            'white': {
                'name_field': 'title',
                'sub_folder': 'white',
                'path_field': 'white_path',
            },
            'black': {
                'name_field': 'title',
                'sub_folder': 'black',
                'path_field': 'black_path',
            }
        })
    # enable the pipeline
    settings.set('ITEM_PIPELINES', {'pipeline.ImageNamePipeline': 200})
    return settings


if __name__ == '__main__':
    # routine to run scrapy from a script
    # see: https://docs.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script
    settings = get_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(CatSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
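
The cwd + '\\' expression above is Windows-specific; a portable variant of the same setting (a sketch, not part of the original script) could be:

import pathlib

# store downloaded images next to the script on any platform
settings.set('IMAGES_STORE', str(pathlib.Path().absolute()))
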
Example #50
def run():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(HMSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished

class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        url = 'http://quotes.toscrape.com/'
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)


configure_logging()
runner = CrawlerRunner()
runner.crawl(AuthorSpider)  # AuthorSpider is assumed to be defined or imported alongside QuotesSpider
runner.crawl(QuotesSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()  # the script will block here until all crawling jobs are finished
import io
import os
import os.path
import sys
import time

sys.path.append('D:/home/python364x86/Lib/site-packages')

from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from hotelSpider import HotelSpider

urlList = sys.argv[1].split(",")
limit = int(sys.argv[2])

# drop empty entries left over from the comma-separated input
newUrlList = [url for url in urlList if len(url) > 1]

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

d = runner.crawl(HotelSpider, newUrlList, limit)
d.addBoth(lambda _: reactor.stop())
reactor.run()
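
A hedged sketch of the same argument handling with argparse, which makes the expected invocation explicit (the option names and defaults here are placeholders, not part of the original script):

import argparse

def parse_args():
    # e.g. python run_hotels.py --urls "https://a.example,https://b.example" --limit 50
    parser = argparse.ArgumentParser(description='Crawl hotel pages with HotelSpider')
    parser.add_argument('--urls', required=True, help='comma-separated list of hotel URLs')
    parser.add_argument('--limit', type=int, default=10, help='page limit passed to the spider')
    args = parser.parse_args()
    url_list = [u for u in args.urls.split(',') if len(u) > 1]
    return url_list, args.limit
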
Example #53
#! /usr/bin/env python
# -*- coding:UTF-8 -*-
# Run multiple spiders at the same time in a single process

from twisted.internet import reactor

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())

runner.crawl('quotes')
runner.crawl('author')
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
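
If the same two spiders need to run one after the other instead of concurrently, the deferred-chaining pattern from the Scrapy documentation applies; a sketch using the same spider names:

from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl_sequentially():
    yield runner.crawl('quotes')   # wait for the first spider to finish
    yield runner.crawl('author')   # then start the second one
    reactor.stop()

crawl_sequentially()
reactor.run()
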
Example #54
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
                else:
                    # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                    #     task, list(maps.keys())))
                    pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()


@click.command()
@click.option('--usage', default='https', help='Usage of squid')
@click.option('--interval',
              default=TTL_VALIDATED_RESOURCE,
              help='Updating frequency of the squid conf.')
def squid_conf_update(usage, interval):
    """Timertask for updating proxies for squid config file"""
    # client_logger.info('the updating task is starting...')
    client = SquidClient(usage)
    client.update_conf()
Example #55
# coding=utf-8
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

from engine.spiders.BaiduSpider import BaiduSpider
from engine.spiders.SpidersList import AssistRedisSpider
# configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
# runner = CrawlerRunner(get_project_settings())
#
# d = runner.crawl(TopSpider)
# d.addBoth(lambda _: reactor.stop())
# reactor.run()

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())

d = runner.crawl(BaiduSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()

# configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
# runner = CrawlerRunner(get_project_settings())
# d = runner.crawl(RedisSpider)
# d.addBoth(lambda _: reactor.stop())
# reactor.run()
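
The commented-out variants above cannot simply be run one after another in the same process, because the Twisted reactor cannot be restarted once stopped. Running several spiders therefore means queuing them on a single runner before starting the reactor; a sketch using the imports already present:

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner(get_project_settings())
runner.crawl(BaiduSpider)
# runner.crawl(AssistRedisSpider)  # further spiders can be queued on the same runner
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
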
Example #56
# -*-coding:utf-8-*-
'''
Created on 2015-08-30

@author: yx
'''
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess
import multiprocessing
from get_url import get_url
import time

if __name__ == '__main__':
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('comment_scrapy')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
    print "get once!!"
    get_url()
    print "update url!!" + ' at ' + time.strftime("%Y-%m-%d %H:%M",
                                                  time.localtime())
Example #57
    def search(self):
        print(self.user)
        print(__name__)
        #print(user, '3')

        #process = CrawlerProcess(get_project_settings())
        #process.crawl('JumpReport')
        #process.start()
        #process.stop()
        #process.put()
        # run the spider from this script (rather than via scrapyd or the scrapy CLI)
        runner = CrawlerRunner(get_project_settings())

        #def search(runner, keyword):
        #    return runner.crawl(JumpReport, keyword)

        #runner = CrawlerProcess()
        #dfs = set()
        print('a')
        runner.crawl('JumpReport', user=self.user)
        print(self.user)
        d = runner.join()
        #dfs.add(d)
        #defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
        d.addBoth(lambda _: reactor.stop())
        #search(runner, "abcd")
        #search(runner, "beat")
        #runner.start()
        reactor.run()  # blocks here while the spider runs

        print("complete")

        # runner = CrawlerRunner(get_project_settings())
        # dfs = set()
        # for domain in range(2):
        #     d = runner.crawl('JumpReport')
        #     dfs.add(d)
        #
        # defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
        # reactor.run()  # the script will block here until all crawling jobs are finished

        # runner = CrawlerRunner(get_project_settings())
        #
        # @defer.inlineCallbacks
        # def crawl():
        #     for domain in range(2):
        #         yield runner.crawl('JumpReport')
        #     reactor.stop()
        #
        # crawl()
        # reactor.run()  # the script will block here until the last crawl call is finished

        # settings = Settings({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
        # runner = CrawlerRunner(settings)
        #
        # d = runner.crawl(JumpReport)
        # d.addBoth(lambda _: reactor.stop())
        # reactor.run() # the script will block here until the crawling is finished

        # runner = CrawlerProcess(get_project_settings())
        # runner.crawl(JumpReport)
        # runner.start()

        name = self.qle.text()
        db = db_handle()
        with db as con:
            sql = "select * from player where name = '{}' order by update_time".format(
                name)
            con.execute(sql)
            player = con.fetchone()
            if player:
                id, name, win, match_count, strength, level, update_time, rank = player
                text = "角色名:  {}\n胜场:    {}\n总场数:  {}\n团分:    {}\n团分排行: {}\n等级:    {}\n更新时间: {}".format(
                    name, win, match_count, strength, rank, level, update_time)

                self.txt.setText(text)

            sql = "select * from player_data where name = '{}' order by date".format(
                name)
            con.execute(sql)
            player_data = con.fetchall()
            a = ""
            for data in player_data:
                a += str(data)
                a += "\n"
            self.battle.setText(str(a))

            sql = "select * from game_data order by match_id desc"
            con.execute(sql)
            game_data = con.fetchall()
            a = ""
            l = 0
            self.battle_table.setRowCount(len(game_data))
            for data in game_data:
                a += str(data[1:])
                print(type(data))

                for i in range(self.battle_table.columnCount()):

                    item = QTableWidgetItem(str(data[i + 1]))
                    # center the cell contents horizontally and vertically
                    item.setTextAlignment(Qt.AlignHCenter | Qt.AlignVCenter)
                    self.battle_table.setItem(l, i, item)

                a += "\n"
                self.player_status.setText(str(a))
                l += 1
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited),
                         11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({
            "DOWNLOAD_TIMEOUT": 0.35
        }).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1,
                                mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        #self.assertTrue(False, crawler.spider.seedsseen)
        #self.assertTrue(crawler.spider.seedsseen.index(None) < crawler.spider.seedsseen.index(99),
        #                crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True,
                            distinct_urls=2,
                            dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False,
                            distinct_urls=3,
                            dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid but flag them as "partial"
        from six.moves.urllib.parse import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(l).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'),
                       dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(
            crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(
            crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(
            crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'),
                         ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'),
                            callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver),
                                 TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler,
                              self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver), ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler,
                                    self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
from SWSpider import *
from SWSpider.spiders.sw_spider import *
import SWSpider.settings

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

east_test = ['DTW', 'BOS', 'DCA', 'FLL', 'MDW']
runner = CrawlerRunner(get_project_settings())
dfs = set()
# truncate the output file before the crawls write to it
open('east_prices', 'w').close()
for x in range(len(east_test)):
    for y in range(len(east_test)):
        if x == y:
            continue
        d = runner.crawl(
            'sw_spider',
            depCity=east_test[x],
            arrCity=east_test[y],
            x=x,
            y=y,
            filename='east_prices')
        dfs.add(d)

defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
# the script will block here until all crawling jobs are finished
reactor.run()
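
An equivalent ending that lets the runner track its own crawl deferreds instead of collecting them by hand (a sketch replacing the manual DeferredList above):

# equivalent: CrawlerRunner.join() waits for every crawl queued on this runner
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
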
Example #60
    def run_spider(self):
        """
        Rather than using scrapyd or executing the spider manually via scrapy,
        this method creates a CrawlerRunner and runs the spider provided at
        construction.

        https://doc.scrapy.org/en/latest/topics/practices.html#run-from-script
        http://twistedmatrix.com/trac/wiki/FrequentlyAskedQuestions#Igetexceptions.ValueError:signalonlyworksinmainthreadwhenItrytorunmyTwistedprogramWhatswrong
        """
        try:

            self.set_stage(ScraperStage.CRAWLING)
            self.start_time = time.time()

            # post debug message to slack
            if self.debug_slack:
                self.handle_slack_message(DEBUG_SLACK_CHANNEL,
                                          'Starting scraper ' + self.name)

            runner = CrawlerRunner({
                'USER_AGENT': get_random_user_agent.get_random_user_agent(),
                'FEED_FORMAT': 'json',
                'FEED_URI': self.new_file_name,
                'AUTOTHROTTLE_ENABLED': 'True',
                'DUPEFILTER_DEBUG': 'True',
            })

            #runner.signals.connect(self.handle_spider_close, signals.spider_closed)

            # todo deferred spider or something like
            # https://kirankoduru.github.io/python/multiple-scrapy-spiders.html
            _d = runner.crawl(self.spider)

            # stop the reactor when we're done
            _d.addBoth(lambda _: reactor.stop())
            # http://twistedmatrix.com/documents/9.0.0/core/howto/deferredindepth.html#auto7
            # https://twistedmatrix.com/documents/17.9.0/api/twisted.internet.defer.Deferred.html
            signal.signal(signal.SIGINT, self.terminate)
            signal.signal(signal.SIGTERM, self.terminate)

            # crawler = Crawler(self.spider, {
            #     'USER_AGENT': get_random_user_agent.get_random_user_agent(),
            #     'FEED_FORMAT': 'json',
            #     'FEED_URI': self.new_file_name,
            #     'AUTOTHROTTLE_ENABLED': 'True',
            #     'DUPEFILTER_DEBUG': 'True'
            # })
            # crawler.signals.connect(self.handle_spider_close, signal=signals.spider_closed)
            #
            # deferred = crawler.crawl()
            # deferred.addBoth(lambda _: self.handle_spider_done)

            reactor.run()

            return True, None

        except KeyboardInterrupt:
            raise KeyboardInterrupt("KeyboardInterrupt caught in run")
        except Exception as _e:
            exc_type, exc_value, exec_tb = sys.exc_info()
            return False, 'Caught ' \
                   + str("".join(traceback.format_exception(exc_type, exc_value, exec_tb)))