Example #1
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Example #2
 def handle_lj(self):
     configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
     runner = CrawlerRunner(crawler_setting)
     #d = runner.crawl(HouseSpider)
     d = runner.crawl(LianjiaHouseSpider)
     d.addBoth(lambda _: reactor.stop())
     reactor.run()
Example #3
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
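A minimal invocation sketch for the helper above (hedged; the task names come from the spider kinds listed in the docstring, and how tasks map to spiders depends on CRAWLER_TASK_MAPS):

# Hypothetical usage: start the crawler-type spiders for two assumed task names;
# an empty task list would start all default crawlers instead.
if __name__ == '__main__':
    crawler_start('crawler', ['common', 'ajax'])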
Example #4
    def test_same_url(self):

        class TestSameUrlSpider(Spider):
            name = 'test_same_url'

            def __init__(self, *args, **kwargs):
                super(TestSameUrlSpider, self).__init__(*args, **kwargs)
                self.visited = 0

            def start_requests(s):
                return self.conman.from_spider(s, self.results)

            def parse_first(self, response):
                self.visited += 1
                return TestItem()

            def parse_second(self, response):
                self.visited += 1
                return TestItem()

        with MockServer() as mockserver:
            contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))

            get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
            get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

            crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
            yield crawler.crawl()

        self.assertEqual(crawler.spider.visited, 2)
Example #5
    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEqual(message_count, 1)
Example #6
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example #7
def run_spider():
	options = {
	    'CONCURRENT_ITEMS': 250,
	    'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
	    'CONCURRENT_REQUESTS': 30,
	    'DOWNLOAD_DELAY': 0.5,
	    'COOKIES_ENABLED': False,
	    }

	settings = get_project_settings()
	configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
	settings.update(options)

	#BookToscrapeSpider basic version
	from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
	#runner = CrawlerRunner(settings)
	#runner.crawl(BookToscrapeSpider())

	#BookToscrapeSpider crawl version
	from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
	runner = CrawlerRunner(settings)
	runner.crawl(BookToscrapeSpider_crawl())

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
	d = runner.join()
	d.addBoth(lambda _: reactor.stop())

	reactor.run()
Example #8
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
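A brief, hedged usage sketch: passing 'all' makes crawl_articles() load every spider id known to the project's spider loader, while unknown ids are silently filtered out.

# Sketch only: run every spider registered in the project.
crawl_articles(['all'])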
Example #9
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #10
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)
Example #11
 def crawl(self):
     spider = Scrapy_ModuleSpider()
     runner = CrawlerRunner(self.Scrapy_Module_setting)
     cra = runner.crawl(spider)
     # stop reactor when spider closes
     cra.addBoth(lambda _: self.spider_closing(cra))
     self.logger.info("Run reactor")
     reactor.run()
Example #12
 def runSpider(self, spider):
     configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
     settings = Settings()
     settings.set('FEED_URI', 'output.json')
     settings.set('FEED_FORMAT', 'json')
     
     runner = CrawlerRunner(settings)
     dfd = runner.crawl(spider)
     dfd.addBoth(lambda _: reactor.stop())
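Note that runSpider() above only schedules the crawl and arranges for the reactor to stop; the caller is expected to start the reactor itself. A rough driver sketch, where the owning class and the spider are hypothetical names:

# Hypothetical caller: MyRunnerWrapper is assumed to define runSpider() as above,
# and QuotesSpider is an assumed spider class. reactor.run() blocks until the
# addBoth callback stops the reactor once the crawl finishes.
from twisted.internet import reactor

wrapper = MyRunnerWrapper()
wrapper.runSpider(QuotesSpider)
reactor.run()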
Example #13
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
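A small usage sketch: the returned Crawler is not started, so callers typically just inspect the settings it was populated with (the setting and value below are arbitrary examples):

# Sketch only: entries from settings_dict end up in the crawler settings
# with project-level priority, per the docstring above.
crawler = get_crawler(settings_dict={'DOWNLOAD_DELAY': 2.5})
assert crawler.settings.getfloat('DOWNLOAD_DELAY') == 2.5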
Example #14
 def _test_delay(self, delay, randomize):
     settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
     crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
     yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
     t = crawler.spider.times
     totaltime = t[-1] - t[0]
     avgd = totaltime / (len(t) - 1)
     tolerance = 0.6 if randomize else 0.2
     self.assertTrue(avgd > delay * (1 - tolerance),
                     "download delay too small: %s" % avgd)
Example #15
    def test_crawler_runner_loading(self):
        module = 'tests.test_spiderloader.test_spiders.spider1'
        runner = CrawlerRunner({'SPIDER_MODULES': [module]})

        self.assertRaisesRegex(KeyError, 'Spider not found',
                                runner.create_crawler, 'spider2')

        crawler = runner.create_crawler('spider1')
        self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
        self.assertEqual(crawler.spidercls.name, 'spider1')
Example #16
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Example #17
def run_login_spider(seed_url, username, password, db_name, logfile="results.log"):

    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url=seed_url, username=username, password=password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level=log.INFO)
    reactor.run()
Example #18
def main():
    locale.setlocale(locale.LC_TIME, 'es_ES')

    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner()

    d = runner.crawl(LotoSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    return None
Example #19
def scrapy_embedding(spidercls):
    settings = get_scrapy_settings()
    # we could create the crawler manually, but CrawlerRunner does it in a more
    # sophisticated way and adds support for spider names given as strings
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spidercls)
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    # log.start(logstdout=False)
    return crawler
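For reference, a sketch of the manual alternative the comment alludes to, assuming a Scrapy version whose Crawler accepts the spider class directly; this skips the string-name resolution that CrawlerRunner adds:

# Manual construction sketch (assumption: scrapy.crawler.Crawler(spidercls, settings)
# is available in the Scrapy version in use); reuses the same private engine hook.
from scrapy.crawler import Crawler

def scrapy_embedding_manual(spidercls):
    settings = get_scrapy_settings()
    crawler = Crawler(spidercls, settings)
    crawler.engine = crawler._create_engine()
    crawler.engine.start()
    return crawler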
Example #20
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #21
 def setUp(self):
     settings = Settings()
     settings.setmodule(undercrawler.settings)
     settings['DOWNLOAD_DELAY'] = 0.1
     settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
     splash_url = os.environ.get('SPLASH_URL')
     if splash_url:
         settings['SPLASH_URL'] = splash_url
     settings.update(self.settings)
     runner = CrawlerRunner(settings)
     self.crawler = runner.create_crawler(BaseSpider)
Example #22
    def test_crawler_runner_bootstrap_failed(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        self.assertEqual(runner.bootstrap_failed, True)
Example #23
 def crawl(self):
     os.environ['SCRAPY_PROJECT'] = '{0}/{1}'.format(BASE_DIR, 'collector')
     runner = CrawlerRunner({'LOG_LEVEL': 'WARNING',
                             'LOG_FORMATTER': 'collector.collector.utils.PoliteLogFormatter',
                             'ITEM_PIPELINES': {
                                 'collector.collector.pipelines.CodingDuplicatesPipeline': 1,
                                 'collector.collector.pipelines.CodingPriorityPipeline': 2
                             }})
     # runner = CrawlerRunner()
     d = runner.crawl(CodingProjectSpider)
     d.addBoth(lambda _: reactor.stop())
     reactor.run()
Example #24
 def test_timeout_failure(self):
     crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
     yield crawler.crawl(n=0.5, mockserver=self.mockserver)
     self.assertTrue(crawler.spider.t1 > 0)
     self.assertTrue(crawler.spider.t2 == 0)
     self.assertTrue(crawler.spider.t2_err > 0)
     self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
     # server hangs after receiving response headers
     yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
     self.assertTrue(crawler.spider.t1 > 0)
     self.assertTrue(crawler.spider.t2 == 0)
     self.assertTrue(crawler.spider.t2_err > 0)
     self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
Example #25
def startprocess(queue):
	runner = CrawlerRunner(get_project_settings())
	dfs = set()
	
	l = runner.crawl('linkspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	# parameter 1 in the callback indicates linkspider
	dfs.add(l)

	s = runner.crawl('srcspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	# parameter 2 in the callback indicates srcspider
	dfs.add(s)
	c = runner.crawl('codespider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	# parameter 3 in the callback indicates codespider
	dfs.add(c)
	defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
			# the script will block here until all crawling jobs are finished
	reactor.run()
Example #26
 def __init__(self, settings, spec_manager):
     # twisted base class is old-style so we cannot use super()
     Resource.__init__(self)
     self.spec_manager = spec_manager
     settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
     self.runner = CrawlerRunner(settings)
     log.msg("bot initialized", level=log.DEBUG)
Example #27
    def run_and_export(self, spider_cls, settings=None):
        """ Run spider with specified settings; return exported data. """
        tmpdir = tempfile.mkdtemp()
        res_name = tmpdir + "/res"
        defaults = {"FEED_URI": "file://" + res_name, "FEED_FORMAT": "csv"}
        defaults.update(settings or {})
        try:
            with MockServer() as s:
                runner = CrawlerRunner(Settings(defaults))
                yield runner.crawl(spider_cls)

            with open(res_name, "rb") as f:
                defer.returnValue(f.read())

        finally:
            shutil.rmtree(tmpdir)
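A hedged usage sketch inside a Twisted test case, assuming run_and_export() is decorated with @defer.inlineCallbacks (it uses yield and defer.returnValue, so that is the natural pairing); the spider class is hypothetical:

@defer.inlineCallbacks
def test_csv_export(self):
    # MyItemSpider is an assumed spider that yields a few items
    data = yield self.run_and_export(MyItemSpider, settings={"FEED_FORMAT": "csv"})
    self.assertTrue(data)  # some CSV bytes were written to the temporary feed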
Example #28
def _run_feed_spider(url, feed):
    spid = str(uuid.uuid4())
    feed['_id'] = spid
    configure_logging(TEST_SETTINGS, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    save_feed(url)
    cls = SpiderFactory.create_spider(feed)
    runner = CrawlerRunner(TEST_SETTINGS)
    d = runner.crawl(cls)
    d.addBoth(lambda _: reactor.stop())
    reactor.run(installSignalHandlers=False)
    n = get_stats([spid])[spid]
    if n == 0:
        raise Exception('feed spider crawled 0 articles')
    if is_exists_spider(url):
        raise Exception(f'feed[{url}] existed')
    del feed['_id']
    save_spider_settings(feed)
Example #29
class Bot(Resource):
    spider = SlydSpider()

    def __init__(self, settings, spec_manager):
        # twisted base class is old-style so we cannot use super()
        Resource.__init__(self)
        self.spec_manager = spec_manager
        settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
        self.runner = CrawlerRunner(settings)
        log.msg("bot initialized", level=log.DEBUG)

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")

    def stop(self):
        """Stop the crawler"""
        self.runner.stop()
        log.msg("bot stopped", level=log.DEBUG)
Example #30
def webcrawl(queue,webs,dom):
	website = ''
	domain = ''
	try:
		runner = CrawlerRunner(get_project_settings())
		dfs = set()
		l = runner.crawl('linkspider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		# parameter 1 in the callback indicates linkspider
		dfs.add(l)
		s = runner.crawl('srcspider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		# parameter 2 in the callback indicates srcspider
		dfs.add(s)
		c = runner.crawl('codespider', website=webs, domain=dom).addCallback(setflag, queue).addErrback(err, queue)
		# parameter 3 in the callback indicates codespider
		dfs.add(c)
		defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
		# the script will block here until all crawling jobs are finished
		reactor.run()
	except Exception as e:
		print(e)
Example #31
        
        while c <= 340:

            keys = response.css("div:nth-child(3)>div:nth-child(" + str(c) + ")>table td>strong::text").extract()
            values = response.css("div:nth-child(3)>div:nth-child(" + str(c) + ")>table td::text").extract()
            c += 2
            s = " "
            s = s.join(values)
            
            k = "NA"
            k = k.join(keys)
            
            req_values[k] = s
            #print(keys[0])
            #print(s)
        print(req_values)

        # req_values['Timestamp'] = dateTimeObj.strftime("%d-%b-%Y (%H:%M:%S.%f)")
        # print(req_values)

req_values = dict()
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

d = runner.crawl(MySpider)
d.addBoth(lambda _: reactor.stop())
reactor.run() # the script will block here until the crawling is finished
Example #32
    start_urls = ['https://basement.redbull.com/university-api/entrants']

    def parse(self, response):

        result = json.loads(response.text)
        List = []
        registrations = result["registrations"]

        for registration in registrations:
            if registration["country"] == "Bosnia and Herzegovina":
                List.append(registration)

        sortedList = sorted(List, key=lambda k: k['voteCount'], reverse=True)

        #newList = eval(json.dumps(sortedList))
        print(json.dumps(sortedList))
        sys.stdout.flush()

#with open('data.json','w') as outfile:
#json.dump(sortedList,outfile)
#return sortedList

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

d = runner.crawl(RedbullSpiderSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished

sleep(1000)
Example #33
def make_crawler(settings, spider_cls=None, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(spider_cls or TestSpider)
Example #34
def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)
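Either make_crawler() helper is typically paired with a deferred crawl inside an inlineCallbacks test; a hedged sketch, assuming a Settings object is available:

@defer.inlineCallbacks
def test_crawl_runs(self):
    # Sketch only: extra keyword arguments override entries in the base settings.
    crawler = make_crawler(Settings(), LOG_LEVEL='WARNING')
    yield crawler.crawl()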
Example #35
class SpiderParser:

    def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        # keep parse results as instance state rather than shared class attributes
        self.items = []
        self.requests = []
        self.response = None
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider)

    def get_callback(self, request):
        """
        get callback from obj or rules
        :param request:
        :return:
        """
        if getattr(self.spidercls, 'rules', None):
            rules = self.spidercls.rules
            rule_index = request.meta.get('rule', -1)
            if rule_index >= 0 and rule_index < len(rules):
                rule = rules[rule_index]
                return rule.callback
            for rule in rules:
                if rule.link_extractor.matches(request.url):
                    return rule.callback
        return 'parse'

    def run_callback(self, response, cb):
        """
        run callback and get items and requests
        :param response:
        :param cb:
        :return:
        """
        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def prepare_request(self, spider, request, args):
        """
        get request
        :param spider:
        :param request:
        :param args:
        :return:
        """
        def callback(response):
            """
            callback
            :param response:
            :return:
            """
            request = response.request
            cb = self.args.callback or 'parse'
            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
            items, requests = self.run_callback(response, cb)

            # process request callback
            for request in requests:
                request.callback = self.get_callback(request)
                request.meta['callback'] = request.callback
            # process items and requests and response
            self.items += list(map(lambda item: process_item(item), items))
            self.requests += list(
                map(lambda request: process_request(request), requests))
            self.response = process_response(response)

        if args.meta:
            request.meta.update(args.meta)

        # update callback
        request.meta['callback'] = request.callback
        request.callback = callback
        return request

    def run(self):
        """
        run main
        :return:
        """
        request = Request(self.args.url, None)
        start_requests = lambda spider: [
            self.prepare_request(spider, request, self.args)
        ]
        self.spidercls.start_requests = start_requests
        self.crawler_process.crawl(self.spidercls)
        if not len(self.crawler_process.crawlers) > 0:
            return {'ok': False}
        # init pcrawler
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        d = self.crawler_process.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return {
            'items': self.items,
            'requests': self.requests,
            'response': self.response,
            'ok': True
        }
Example #36
from jk_en.sendEmail import *
import os
import sys

# sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# print("***********************", os.path.dirname(os.path.abspath(__file__)))
# os.system('scrapy crawl school_jk_spider')

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from jk_en.spiders import school_jk_apider

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner()

d = runner.crawl(school_jk_apider.SchoolJkSpiderSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()

SendEmailMain()
Example #37
from scrapy.crawler import CrawlerProcess
from spiders.FlatLinkRemodelSpider import FlatLinkRemodelSpider
from spiders.HouseLinkRemodelSpider import HouseLinkRemodelSpider

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
runner.crawl(FlatLinkRemodelSpider)
runner.crawl(HouseLinkRemodelSpider)

d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
Example #38
from flask import Flask, render_template, jsonify, request, redirect, url_for, session, escape
import flask_excel as excel
import time
import datetime
import sys

from scrapy.crawler import CrawlerRunner

sys.path.insert(0, './hotels/travelData/spiders')

from booking import BookingSpider

app = Flask(__name__)

app.secret_key = "Super_secret_key"

excel.init_excel(app)

crawl_runner = CrawlerRunner()
output_data = []
desCity = ''
checkinDate = ''
checkoutDate = ''
room = 1
traveler = 1


@app.route('/')
def index():
    return render_template("index.html")


# After clicking the Submit Button FLASK will come into this
@app.route('/', methods=['POST'])
Example #39
 def __init__(self):
     self.output = {}
     #self.runner = CrawlerProcess(settings={'LOG_ENABLED': False})
     self.runner = CrawlerRunner(settings={'LOG_ENABLED': False})
Example #40
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited),
                         11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_fixed_delay(self):
        yield self._test_delay(total=3, delay=0.2)

    @defer.inlineCallbacks
    def test_randomized_delay(self):
        yield self._test_delay(total=3, delay=0.1, randomize=True)

    @defer.inlineCallbacks
    def _test_delay(self, total, delay, randomize=False):
        crawl_kwargs = dict(
            maxlatency=delay * 2,
            mockserver=self.mockserver,
            total=total,
        )
        tolerance = (1 - (0.6 if randomize else 0.2))

        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertTrue(average > delay * tolerance,
                        f"download delay too small: {average}")

        # Ensure that the same test parameters would cause a failure if no
        # download delay is set. Otherwise, it means we are using a combination
        # of ``total`` and ``delay`` values that are too small for the test
        # code above to have any meaning.
        settings["DOWNLOAD_DELAY"] = 0
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertFalse(average > delay / tolerance,
                         "test total or delay values are too small")

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({
            "DOWNLOAD_TIMEOUT": 0.35
        }).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as log:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1,
                                mockserver=self.mockserver)

        self.assertEqual(len(log.records), 1)
        record = log.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as log:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(log.records), 1)
        record = log.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertTrue(
            crawler.spider.seedsseen.index(None) <
            crawler.spider.seedsseen.index(99), crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True,
                            distinct_urls=2,
                            dupe_factor=3,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False,
                            distinct_urls=3,
                            dupe_factor=4,
                            mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid but flag them as "partial"
        from urllib.parse import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url(f"/raw?{query}"),
                                mockserver=self.mockserver)
        self.assertEqual(str(log).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'),
                       dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(
            crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(
            crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'),
                         ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'),
                            callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_format_engine_status(self):
        from scrapy.utils.engine import format_engine_status
        est = []

        def cb(response):
            est.append(format_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'),
                            callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        est = est[0].split("\n")[2:-2]  # remove header & footer
        # convert to dict
        est = [x.split(":") for x in est]
        est = [x for sublist in est for x in sublist]  # flatten
        est = [x.lstrip().rstrip() for x in est]
        it = iter(est)
        s = dict(zip(it, it))

        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], '1')

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver),
                                 TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler,
                              self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver), ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler,
                                    self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
Example #41
class CrawlSpiderTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def _run_spider(self, spider_cls):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(spider_cls)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        return log, items, crawler.stats

    @defer.inlineCallbacks
    def test_crawlspider_with_parse(self):
        self.runner.crawl(CrawlSpiderWithParseMethod,
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self.assertIn("[parse] status 200 (foo: None)", str(log))
        self.assertIn("[parse] status 201 (foo: None)", str(log))
        self.assertIn("[parse] status 202 (foo: bar)", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))
        self.assertIn("[errback] status 501", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        self.runner.crawl(AsyncDefSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        runner = CrawlerRunner({
            "TWISTED_REACTOR":
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        })
        runner.crawl(AsyncDefAsyncioSpider,
                     self.mockserver.url("/status?n=200"),
                     mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_list(self):
        log, items, _ = yield self._run_spider(AsyncDefAsyncioReturnSpider)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_single_element(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(
            AsyncDefAsyncioReturnSingleElementSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({"foo": 42}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse(self):
        log, _, stats = yield self._run_spider(AsyncDefAsyncioGenSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 1)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_loop(self):
        log, items, stats = yield self._run_spider(
            AsyncDefAsyncioGenLoopSpider)
        self.assertIn("Got response 200", str(log))
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 10)
        for i in range(10):
            self.assertIn({'foo': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_complex(self):
        _, items, stats = yield self._run_spider(
            AsyncDefAsyncioGenComplexSpider)
        itemcount = stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 156)
        # some random items
        for i in [1, 4, 21, 22, 207, 311]:
            self.assertIn({'index': i}, items)
        for i in [10, 30, 122]:
            self.assertIn({'index2': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_reqs_list(self):
        log, *_ = yield self._run_spider(AsyncDefAsyncioReqsReturnSpider)
        for req_id in range(3):
            self.assertIn(f"Got response 200, req_id {req_id}", str(log))

    @defer.inlineCallbacks
    def test_response_ssl_certificate_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=False)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta['responses'][0].certificate)

    @defer.inlineCallbacks
    def test_response_ssl_certificate(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @mark.xfail(
        reason="Responses with no body return early and contain no certificate"
    )
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @defer.inlineCallbacks
    def test_dns_server_ip_address_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/status?n=200')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsNone(ip_address)

    @defer.inlineCallbacks
    def test_dns_server_ip_address(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/echo?body=test')
        expected_netloc, _ = urlparse(url).netloc.split(':')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsInstance(ip_address, IPv4Address)
        self.assertEqual(str(ip_address), gethostbyname(expected_netloc))

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_callback(self):
        crawler = self.runner.create_crawler(BytesReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].body,
                         crawler.spider.meta.get("bytes_received"))
        self.assertLess(len(crawler.spider.meta["response"].body),
                        crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_bytes_received_stop_download_errback(self):
        crawler = self.runner.create_crawler(BytesReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value,
                              StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response,
                              Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.body,
                         crawler.spider.meta.get("bytes_received"))
        self.assertLess(
            len(crawler.spider.meta["failure"].value.response.body),
            crawler.spider.full_response_length)

    @defer.inlineCallbacks
    def test_headers_received_stop_download_callback(self):
        crawler = self.runner.create_crawler(HeadersReceivedCallbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("failure"))
        self.assertIsInstance(crawler.spider.meta["response"], Response)
        self.assertEqual(crawler.spider.meta["response"].headers,
                         crawler.spider.meta.get("headers_received"))

    @defer.inlineCallbacks
    def test_headers_received_stop_download_errback(self):
        crawler = self.runner.create_crawler(HeadersReceivedErrbackSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta.get("response"))
        self.assertIsInstance(crawler.spider.meta["failure"], Failure)
        self.assertIsInstance(crawler.spider.meta["failure"].value,
                              StopDownload)
        self.assertIsInstance(crawler.spider.meta["failure"].value.response,
                              Response)
        self.assertEqual(crawler.spider.meta["failure"].value.response.headers,
                         crawler.spider.meta.get("headers_received"))
Example #42
class BookCrawler:
    """Crawl the provided book, exporting the crawled images"""

    exporter = None

    def __init__(self,
                 provider: CrawlSpider,
                 slug: str,
                 output_dir: str,
                 verbose=False):

        self.provider = provider
        self.slug = slug
        self.output_dir = output_dir
        self.verbose = verbose

        if verbose:
            configure_logging()

        self.runner = CrawlerRunner()

    def get_volumes_links(self) -> Iterable[str]:
        """Get the available list of volumes links for the wanted book slug"""

        scraper = cfscrape.create_scraper()
        book_url = (f'http://{self.provider.allowed_domains[0]}/'
                    f'{self.provider.url_key}/{self.slug}')

        response = scraper.get(book_url)
        document = document_fromstring(response.text)
        volume_elements = document.xpath(
            '//table[@class="listing"]//tr[position()>2]')

        if not volume_elements:
            raise BookScrapeException('No volumes found for the "%s" slug' %
                                      self.slug)

        volumes = []

        for volume_element in volume_elements:
            volume_link = volume_element.xpath('./td[1]/a/@href')[0]
            volumes.append(volume_link)

        volumes.reverse()

        return volumes

    def run(self, volume_start: int, volume_end: int):
        self.exporter = PdfExporter(
            self.output_dir,
            os.path.join(self.output_dir, 'images'),
            file_name='%s_%s.pdf' % (self.slug, '-'.join(
                [str(volume_start), str(volume_end)])))
        logger.info(
            'Crawling started for the book slug "%s" on the "%s" provider.',
            self.slug, self.provider.name)

        volumes_list = list(range(volume_start, volume_end + 1))
        self.crawl(volumes_list)
        reactor.run()

    @staticmethod
    def _on_error(failure):
        if isinstance(failure.value, BookScrapeException):
            logger.error(str(failure.value))
        else:
            logger.error(failure)

    def _get_crawler(self) -> Crawler:
        crawler = Crawler(self.provider,
                          settings={
                              'IMAGES_STORE':
                              os.path.join(self.output_dir, 'images'),
                              **SETTINGS
                          })
        crawler.signals.connect(self._on_error, signals.spider_error)
        crawler.signals.connect(self.exporter.export_item,
                                signals.item_scraped)

        return crawler

    @defer.inlineCallbacks
    def crawl(self, volumes: Iterable[int]):
        yield self.runner.crawl(self._get_crawler(),
                                book_slug=self.slug,
                                volumes=volumes)

        reactor.stop()
        self.exporter.finish_exporting()
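A small usage sketch (the provider spider and slug are assumptions); run() blocks on the reactor until the requested volumes are crawled and exported as a single PDF:

# Hypothetical usage: SomeProviderSpider is an assumed CrawlSpider subclass exposing
# the allowed_domains, url_key and name attributes that BookCrawler relies on.
crawler = BookCrawler(provider=SomeProviderSpider,
                      slug='example-book',
                      output_dir='./output',
                      verbose=True)
crawler.run(volume_start=1, volume_end=3)  # exports example-book_1-3.pdf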
Example #43
from scrapy.crawler import CrawlerProcess
from spiders.FlatSpider import FlatSpider
from spiders.HouseSpider import HouseSpider

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


configure_logging()
runner = CrawlerRunner()
runner.crawl(FlatSpider)
runner.crawl(HouseSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()
Example #44
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited),
                         11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_fixed_delay(self):
        yield self._test_delay(total=3, delay=0.2)

    @defer.inlineCallbacks
    def test_randomized_delay(self):
        yield self._test_delay(total=3, delay=0.1, randomize=True)

    @defer.inlineCallbacks
    def _test_delay(self, total, delay, randomize=False):
        crawl_kwargs = dict(
            maxlatency=delay * 2,
            mockserver=self.mockserver,
            total=total,
        )
        tolerance = (1 - (0.6 if randomize else 0.2))

        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertTrue(average > delay * tolerance,
                        "download delay too small: %s" % average)

        # Ensure that the same test parameters would cause a failure if no
        # download delay is set. Otherwise, it means we are using a combination
        # of ``total`` and ``delay`` values that are too small for the test
        # code above to have any meaning.
        settings["DOWNLOAD_DELAY"] = 0
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(**crawl_kwargs)
        times = crawler.spider.times
        total_time = times[-1] - times[0]
        average = total_time / (len(times) - 1)
        self.assertFalse(average > delay / tolerance,
                         "test total or delay values are too small")

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({
            "DOWNLOAD_TIMEOUT": 0.35
        }).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=503"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:65432/status?n=503",
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./",
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as log:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1,
                                mockserver=self.mockserver)

        self.assertEqual(len(log.records), 1)
        record = log.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as log:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(log.records), 1)
        record = log.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            BrokenStartRequestsSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertTrue(
            crawler.spider.seedsseen.index(None) <
            crawler.spider.seedsseen.index(99), crawler.spider.seedsseen)

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(
            DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True,
                            distinct_urls=2,
                            dupe_factor=3,
                            mockserver=self.mockserver)
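        # dont_filter=True downloads every duplicate: 2 distinct URLs * 3 = 6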
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False,
                            distinct_urls=3,
                            dupe_factor=4,
                            mockserver=self.mockserver)
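        # with the dupefilter active, only the 3 distinct URLs are visited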
        self.assertEqual(crawler.spider.visited, 3)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # cannot be determined; we treat them as valid but flag them as "partial"
        from urllib.parse import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/raw?{0}".format(query)),
                                mockserver=self.mockserver)
        self.assertEqual(str(log).count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/drop?abort=0"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/drop?abort=1"),
                                mockserver=self.mockserver)
        self._assert_retried(log)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'),
                       dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(
            crawler.spider.meta['responses'][0].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(
            crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(
            crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(
            crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'),
                         ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'),
                            callback_func=cb,
                            mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver),
                                 TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler,
                              self.mockserver.url("/status?n=200"),
                              mockserver=self.mockserver), ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler,
                                    self.mockserver.url("/status?n=200"),
                                    mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider,
                          self.mockserver.url("/status?n=503"),
                          mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawlspider_with_errback(self):
        self.runner.crawl(CrawlSpiderWithErrback, mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self.assertIn("[callback] status 200", str(log))
        self.assertIn("[callback] status 201", str(log))
        self.assertIn("[errback] status 404", str(log))
        self.assertIn("[errback] status 500", str(log))

    @defer.inlineCallbacks
    def test_async_def_parse(self):
        self.runner.crawl(AsyncDefSpider,
                          self.mockserver.url("/status?n=200"),
                          mockserver=self.mockserver)
        with LogCapture() as log:
            yield self.runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse(self):
        runner = CrawlerRunner({
            "TWISTED_REACTOR":
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        })
        runner.crawl(AsyncDefAsyncioSpider,
                     self.mockserver.url("/status?n=200"),
                     mockserver=self.mockserver)
        with LogCapture() as log:
            yield runner.join()
        self.assertIn("Got response 200", str(log))

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_items_list(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        crawler = self.runner.create_crawler(AsyncDefAsyncioReturnSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        self.assertIn({'id': 1}, items)
        self.assertIn({'id': 2}, items)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse(self):
        from tests.py36._test_crawl import AsyncDefAsyncioGenSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 1)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_loop(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        from tests.py36._test_crawl import AsyncDefAsyncioGenLoopSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenLoopSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 10)
        for i in range(10):
            self.assertIn({'foo': i}, items)

    @mark.skipif(sys.version_info < (3, 6),
                 reason="Async generators require Python 3.6 or higher")
    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncgen_parse_complex(self):
        items = []

        def _on_item_scraped(item):
            items.append(item)

        from tests.py36._test_crawl import AsyncDefAsyncioGenComplexSpider
        crawler = self.runner.create_crawler(AsyncDefAsyncioGenComplexSpider)
        crawler.signals.connect(_on_item_scraped, signals.item_scraped)
        yield crawler.crawl(mockserver=self.mockserver)
        itemcount = crawler.stats.get_value('item_scraped_count')
        self.assertEqual(itemcount, 156)
        # some random items
        for i in [1, 4, 21, 22, 207, 311]:
            self.assertIn({'index': i}, items)
        for i in [10, 30, 122]:
            self.assertIn({'index2': i}, items)

    @mark.only_asyncio()
    @defer.inlineCallbacks
    def test_async_def_asyncio_parse_reqs_list(self):
        crawler = self.runner.create_crawler(AsyncDefAsyncioReqsReturnSpider)
        with LogCapture() as log:
            yield crawler.crawl(self.mockserver.url("/status?n=200"),
                                mockserver=self.mockserver)
        for req_id in range(3):
            self.assertIn("Got response 200, req_id %d" % req_id, str(log))

    @defer.inlineCallbacks
    def test_response_ssl_certificate_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=False)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        self.assertIsNone(crawler.spider.meta['responses'][0].certificate)

    @defer.inlineCallbacks
    def test_response_ssl_certificate(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/echo?body=test", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @mark.xfail(
        reason="Responses with no body return early and contain no certificate"
    )
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta['responses'][0].certificate
        self.assertIsInstance(cert, Certificate)
        self.assertEqual(cert.getSubject().commonName, b"localhost")
        self.assertEqual(cert.getIssuer().commonName, b"localhost")

    @defer.inlineCallbacks
    def test_dns_server_ip_address_none(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/status?n=200')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsNone(ip_address)

    @defer.inlineCallbacks
    def test_dns_server_ip_address(self):
        crawler = self.runner.create_crawler(SingleRequestSpider)
        url = self.mockserver.url('/echo?body=test')
        expected_netloc, _ = urlparse(url).netloc.split(':')
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        ip_address = crawler.spider.meta['responses'][0].ip_address
        self.assertIsInstance(ip_address, IPv4Address)
        self.assertEqual(str(ip_address), gethostbyname(expected_netloc))
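Several of the tests above (the Referer, engine-status, certificate and DNS-address checks) rely on a SingleRequestSpider helper from Scrapy's own test suite that is not shown here. A rough sketch of how such a helper could be written, assuming it follows a chain of requests stored in request.meta['next'] and records every response in spider.meta; the real implementation lives in tests/spiders.py and may differ:

import scrapy


class ChainingSpider(scrapy.Spider):
    """Follows request.meta['next'] chains and records responses in self.meta."""
    name = 'chaining'

    def __init__(self, seed=None, callback_func=None, mockserver=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seed = seed
        self.callback_func = callback_func
        self.meta = {}

    def start_requests(self):
        if isinstance(self.seed, scrapy.Request):
            yield self.seed.replace(callback=self.parse, errback=self.on_error)
        else:
            yield scrapy.Request(self.seed, callback=self.parse, errback=self.on_error)

    def parse(self, response):
        self.meta.setdefault('responses', []).append(response)
        if callable(self.callback_func):
            return self.callback_func(response)
        if 'next' in response.meta:
            return response.meta['next']  # follow the chained request

    def on_error(self, failure):
        self.meta.setdefault('failures', []).append(failure)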
Example #45
0
import logging

from twisted.internet import reactor
from sqlalchemy.orm import sessionmaker
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

# db_connect is assumed to live in orangespider.models alongside the rule models
from orangespider.models import ArticleRule, BookRule, db_connect
from orangespider.spiders.article_spider import ArticleSpider
from orangespider.spiders.book_spider import BookSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    # Load ArticleRule
    article_rules = session.query(ArticleRule).filter(
        ArticleRule.enable == 1).all()
    # Load BookRule
    book_rules = session.query(BookRule).filter(BookRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)

    # init ArticleSpider
    # for article_rule in article_rules:
    #     runner.crawl(ArticleSpider, rule=article_rule)

    # init BookSpider
    for book_rule in book_rules:
        runner.crawl(BookSpider, rule=book_rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    logging.info('Spider Finished!')
Example #46
0
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
     self.runner = CrawlerRunner()
Example #47
0
import logging

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class AmazonReviewsSpider(scrapy.Spider):
    name = 'amazon_reviews'  # placeholder; the original name is cut off above
    allowed_domains = ['amazon.sa']
    start_urls = start_urls  # the URL list is defined earlier in the original script
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'

    def parse(self, response):
        data = response.css('#cm_cr-review_list')
        star_rating = data.css('.review-rating')
        comments = data.css('.review-text')
        # pair each star rating with the review text at the same position
        for review, comment in zip(star_rating, comments):
            yield {
                'stars': ''.join(review.xpath('.//text()').extract()),
                'comment': ''.join(comment.xpath('.//text()').extract()),
            }
        next_page = response.css('.a-last a ::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
logger = logging.getLogger('scrapy.core.scraper')
logger.setLevel('INFO')
runner = CrawlerRunner(settings={
    "FEEDS": {
        "reviews2.csv": {"format": "csv"},
    },
})

d = runner.crawl(AmazonReviewsSpider)
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until the crawling is finished
Example #48
0
def main(argv):
    inputfile = ''
    try:
        opts, args = getopt.getopt(argv, "hi:", ["ifile="])
    except getopt.GetoptError:
        print('crawlers.py -i <inputfile>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('crawlers.py -i <inputfile>')
            sys.exit()
        elif opt in ("-i", "--ifile="):
            inputfile = arg

    ticks = list(read_file(inputfile))
    # Create and run spiders
    configure_logging()
    crawler_settings = Settings()
    crawler_settings.setmodule(my_settings)
    runner = CrawlerRunner(settings=crawler_settings)

    for tick in ticks:
        kwargs = {'tick': tick}
        runner.crawl(MWSpider, **kwargs)
        runner.crawl(ReutersSpider, **kwargs)
        runner.crawl(BloSpider, **kwargs)
        runner.crawl(MSNBCSpider, **kwargs)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #49
0
    def __init__(self, id_list, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.id_list = id_list

        configure_logging()
        self.runner = CrawlerRunner(get_project_settings())
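The snippet stops right after building the runner; a class like this usually also has a method that schedules one crawl per id and blocks until they all finish. A hypothetical sketch (the spider name and its keyword argument are placeholders, not part of the original project):

    def run(self):
        from twisted.internet import reactor

        for item_id in self.id_list:
            # 'item_spider' and item_id are illustrative placeholders
            self.runner.crawl('item_spider', item_id=item_id)

        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # blocks until every scheduled crawl has finished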
Example #50
0
 def test_crawler_runner_accepts_None(self):
     runner = CrawlerRunner()
     self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example #51
0
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from weibo.spiders.topspider import TopspiderSpider
from weibo.spiders.weibospider import WeibospiderSpider
import logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())


@defer.inlineCallbacks
def crawl():
    # The Twisted reactor cannot be restarted, so the endless cycle has to live
    # inside the deferred chain rather than around reactor.run().
    while True:
        logging.info("new cycle is starting")
        yield runner.crawl(TopspiderSpider)
        logging.info("TopspiderSpider is stopped")
        yield runner.crawl(WeibospiderSpider)


crawl()
reactor.run()  # blocks here while the loop above keeps scheduling new crawl cycles
Example #52
0
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
import json
# apply the project settings
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
reactor.suggestThreadPoolSize(30)
runner = CrawlerRunner(get_project_settings())
# import the spider
from bugs.spiders.jd_spider import JDSpoder
# urls = []
# for line in open('../data/jd-goods-id.json', 'r',encoding="utf8"):
#     data = json.loads(line)
#     url = 'https://item.jd.com/{id}.html'.format(id = data['goods_id'])
#     urls.append(url)
# 'https://item.jd.com/14786160283.html',
urls = [
    'https://item.yiyaojd.com/13674344768.html',
    'https://item.yiyaojd.com/3154005.html',
    'https://item.yiyaojd.com/4410004.html',
    'https://item.yiyaojd.com/13214528412.html',
    'https://item.yiyaojd.com/4808847.html',
    'https://item.yiyaojd.com/3091800.html',
    'https://item.yiyaojd.com/3108320.html',
    'https://item.yiyaojd.com/12801590412.html',
    'https://item.yiyaojd.com/3172871.html',
    'https://item.yiyaojd.com/17990313602.html'
]
# The original snippet is cut off here; presumably each URL is handed to the
# spider and the reactor is started once every crawl has been scheduled:
for url in urls:
    runner.crawl(JDSpoder, url=url)

d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
Example #53
0
    # # process.crawl("lawlib_xinshi_spider", domain={"law-lib.com"})
    # # process.crawl("lawlib_minshi_spider", domain={"law-lib.com"})
    # # process.crawl("lawlib_xinzhen_spider", domain={"law-lib.com"})
    #
    # process.crawl("qq_news_spider", domain={"qq.com"})
    #
    # process.crawl("sina_news_spider", domain={"sina.com.cn"})
    # process.crawl("sina_sifa_news_spider", domain={"sina.com.cn"})
    # process.crawl("sina_sifa_publicity_spider", domain={"sina.com.cn"})
    #
    # process.start()

    # running the spiders sequentially by chaining the deferreds:
    configure_logging()
    settings = get_project_settings()
    runner = CrawlerRunner(settings=settings)

    @defer.inlineCallbacks
    def crawl():
        # china
        yield runner.crawl(china.ChinaNewsSpider)
        yield runner.crawl(china.ChinaTheorySpider)
        yield runner.crawl(china.ChinaAffairSpider)

        # cctv
        yield runner.crawl(cctv.CCTVShipingSpider)
        yield runner.crawl(cctv.CCTVNewsSpider)
        yield runner.crawl(cctv.CCTVCaijingSpider)

        # chinadaily
        yield runner.crawl(chinadaily.ChinadailyChinaSpider)
        # (truncated in the original; any remaining spiders would be yielded here,
        # after which the reactor is stopped so the chain below can finish)
        reactor.stop()

    crawl()
    reactor.run()  # blocks here until the sequential chain above completes
Example #54
0
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


# Crawl the shop URLs for a given keyword
class Img(scrapy.Spider):
    name = "img"

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)

    def start_requests(self):
        url = "http://www.meituan.com/"
        yield scrapy.Request(url=url,
                             callback=self.parse,
                             meta={"cookiejar": 1})

    def parse(self, response):
        print(response.text)
        yield scrapy.Request(url="http://www.meituan.com/deal/47840801.html",
                             callback=self.parse,
                             dont_filter=True,
                             meta={"cookiejar": response.meta["cookiejar"]})


if __name__ == "__main__":
    configure_logging({"LOG_FORMAT": "%(levelname)s: %(message)s"})
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(Img)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until the crawling is finished
Example #55
0
import logging

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from coolscrapy.models import db_connect, create_news_table
from coolscrapy.models import ArticleRule
from sqlalchemy.orm import sessionmaker
from coolscrapy.spiders.article_spider import ArticleSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)

    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    # blocks process so always keep as the last statement
    reactor.run()
    logging.info('all finished.')
Example #56
0
def handler(event, context):
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('github_trend_crawler', timescale='daily')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
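Because the Twisted reactor cannot be restarted, the handler above only works for the first invocation in a given process. A hedged alternative sketch using crochet, which keeps the reactor alive in a background thread (the timeout is an arbitrary example value; the spider name and argument are taken from the handler above):

import crochet
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

crochet.setup()  # start the reactor once, in a background thread


@crochet.wait_for(timeout=120)
def _run_crawl():
    runner = CrawlerRunner(get_project_settings())
    # returning the Deferred makes wait_for block until the crawl finishes
    return runner.crawl('github_trend_crawler', timescale='daily')


def handler(event, context):
    _run_crawl()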
Example #57
0
 def test_crawler_runner_bootstrap_successful_for_several(self):
     runner = CrawlerRunner()
     yield runner.crawl(NoRequestsSpider)
     yield runner.crawl(NoRequestsSpider)
     self.assertEqual(runner.bootstrap_failed, False)
Example #58
0
def runSpider():
    # initialize crawler with current project settings
    crawler = CrawlerRunner(get_project_settings())
    # crawl the mimgspider and pass in the filename below for start url config
    crawler.crawl('mimgspider', 'C:/Users/kimbe/Documents/15112/TermProject/urls.txt')  
Example #59
0
 def test_crawler_runner_accepts_dict(self):
     runner = CrawlerRunner({'foo': 'bar'})
     self.assertEqual(runner.settings['foo'], 'bar')
     self.assertOptionIsDefault(runner.settings, 'RETRY_ENABLED')
Example #60
0
import crochet
from pydispatch import dispatcher
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.settings import Settings
from scrapy.utils.log import configure_logging

# `sets` (the project's settings module) and `database` are project-local modules
# assumed to be importable in the original application.


class Scrape:
    """Bridge between Flask and Scrapy.

    * runs the crawler in the Twisted reactor synchronously (via crochet)
    * holds a single shared CrawlerRunner instance
    """
    crawl_runner = CrawlerRunner()
    dict_of_spiders = {}

    def scrape(self, domain, dict_of_spiders):
        """Run the crawlers in the Twisted reactor synchronously.

        :param domain: the list of domains
        :param dict_of_spiders: e.g. {'springer': SpringerDoi, 'wiley': WileyDoi, 'ieee': IeeeDoi}
        """
        domains = domain
        self.dict_of_spiders = dict_of_spiders

        for domain in domains:
            # Keep a reference to the EventualResult so the crawl that actually
            # timed out is the one that gets cancelled.
            eventual = self.scrape_with_crochet(domain)
            try:
                eventual.wait(timeout=5)
            except crochet.TimeoutError:
                eventual.cancel()
                raise

    @crochet.run_in_reactor
    def scrape_with_crochet(self, domain):
        """
                signal fires when single item is processed and calls _crawler_result to save that item.

        Consider some synchronous do-one-thing-after-the-other application code that wants to use event-driven Twisted-using code.
        We have two threads at a minimum: the application thread(s) and the reactor thread. There are also multiple layers
        of code involved in this interaction

        Twisted code: Should only be called in reactor thread. This may be code from the Twisted package itself, or more
        likely code you have written that is built on top of Twisted.

        @wait_for/@run_in_reactor wrappers: The body of the functions runs in the reactor thread... but the caller
        should be in the application thread.

        The application code: Runs in the application thread(s), expects synchronous/blocking calls.
        dispatcher.connect will connect to the dispatcher that will kind of loop the code between these two functions.
        crawl_runner.crawl will connect to the our particular spider function based on the domain name,
        in our scrapy file and after each yield will pass to the crawler_result function.
        The setting.py is applied to the crawl runner.

        :param domain: the domain to crawl
        :return: a twisted.internet.defer.Deferred

        """
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        crawler_settings = Settings()
        crawler_settings.setmodule(sets)
        self.crawl_runner.settings = crawler_settings
        dispatcher.connect(self._crawler_result, signal=signals.item_scraped)

        for i in self.dict_of_spiders:
            if i in domain:
                eventual = self.crawl_runner.crawl(self.dict_of_spiders[i], category=domain)
                return eventual

    def _crawler_result(self, item, response, spider):
        """Callback fired for every item as it is scraped (item_scraped signal).

        :param item: the item scraped from the website; saved to the database
        """
        database.save(dict(item))
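For completeness, a hypothetical sketch of how the Scrape class above could be driven from a Flask view, assuming the spider classes named in its docstring (SpringerDoi, WileyDoi, IeeeDoi) are importable from the project (the module path below is a placeholder):

from flask import Flask, jsonify

# hypothetical project import; the module path is a placeholder
from spiders.doi_spiders import SpringerDoi, WileyDoi, IeeeDoi

app = Flask(__name__)
scraper = Scrape()
SPIDERS = {'springer': SpringerDoi, 'wiley': WileyDoi, 'ieee': IeeeDoi}


@app.route('/scrape')
def run_scrape():
    # each entry must contain one of the keys in SPIDERS so scrape() can match it
    domains = ['link.springer.com/article/x', 'onlinelibrary.wiley.com/doi/y']
    scraper.scrape(domains, SPIDERS)
    return jsonify(status='done')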