def test_priorization(self):
        webdriver = Mock()
        settings = self.settings(WEBDRIVER_BROWSER=webdriver)
        webdriver.get.side_effect = self._wait
        webdriver.page_source = u''

        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

        crawler = Crawler(Settings(values=settings))
        crawler.configure()
        spider = self.Spider(name='test', domain='testdomain')
        crawler.crawl(spider)
        crawler.start()
        log.start(loglevel='ERROR')
        reactor.run()

        assert webdriver.get.mock_calls == [
            call('http://testdomain/path?wr=0'),
            call('http://testdomain/path?wr=0&wa=0'),
            call('http://testdomain/path?wr=0&wa=1'),
            call('http://testdomain/path?wr=1'),
            call('http://testdomain/path?wr=1&wa=0'),
            call('http://testdomain/path?wr=1&wa=1'),
            call('http://testdomain/path?wr=0&wa=0&wr=0'),
            call('http://testdomain/path?wr=0&wa=1&wr=0'),
            call('http://testdomain/path?wr=1&wa=0&wr=0'),
            call('http://testdomain/path?wr=1&wa=1&wr=0')
        ]
Example #2
 def setUp(self):
     """Initialize the test."""
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
     self.requests = self.spider.start_requests()
Example #3
 def execute(self):
     start_time = datetime.now()
     date_columns = {
         "weekend": dh.next_friday(),
         "today": datetime.now(),
         "autumn": dh.autumn_date(),
         "summer": dh.summer_date(),
         "spring": dh.spring_date(),
         "winter": dh.winter_date(),
         "new_year": dh.new_year_date(),
         "one_month": dh.current_date_plus_months(1),
         "three_months": dh.current_date_plus_months(3),
         "five_months": dh.current_date_plus_months(5)
     }
     spiders_executor = CrawlerProcess()
     for process_name, date in date_columns.items():
         booking_crawler = Crawler(bs, get_project_settings())
         booking_crawler.signals.connect(self.spider_done,
                                         signals.spider_closed)
         spiders_executor.crawl(booking_crawler,
                                column_name=process_name,
                                date=date,
                                city=self.__city)
         self.RUNNING_SPIDERS.append(booking_crawler.spider)
     if len(spiders_executor.crawlers) < len(date_columns):
         print("Less crawlers than date_columns")
         self.execute()
     spiders_executor.start()
     print("Scrapping {} in {}s".format(
         self.__city, str((datetime.now() - start_time).seconds)))
Example #4
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(os.path.dirname(fixture_path)))

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings
Example #5
 def __init__(self, spider):
     Process.__init__(self)
     self.crawler = Crawler()
     self.crawler.configure()
     self.crawler.signals.connect(reactor.stop,
                                  signal=signals.spider_closed)
     self.spider = spider
Example #6
 def __init__(self, splash_url, crawler_depth_limit):
     self.process = CrawlerProcess({'LOG_ENABLED': False})
     self.crawler = Crawler(
         self.TorSplashSpider, {
             'USER_AGENT':
             'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {
                 'scrapy_splash.SplashCookiesMiddleware':
                 723,
                 'scrapy_splash.SplashMiddleware':
                 725,
                 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
                 810,
             },
             'SPIDER_MIDDLEWARES': {
                 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
             },
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
             'CLOSESPIDER_PAGECOUNT': 50,
             'DEPTH_LIMIT': crawler_depth_limit
         })
Example #7
 def __init__(self,
              splash_url,
              useragent,
              depth=1,
              log_enabled=False,
              log_level='WARNING'):
     self.process = CrawlerProcess({'LOG_ENABLED': log_enabled})
     self.crawler = Crawler(
         self.ScrapySplashWrapperSpider, {
             'LOG_ENABLED': log_enabled,
             'LOG_LEVEL': log_level,
             'USER_AGENT': useragent,
             'SPLASH_URL': splash_url,
             'DOWNLOADER_MIDDLEWARES': {
                 'scrapy_splash.SplashCookiesMiddleware':
                 723,
                 'scrapy_splash.SplashMiddleware':
                 725,
                 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
                 810,
             },
             'SPIDER_MIDDLEWARES': {
                 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                 'scrapysplashwrapper.ScrapySplashWrapperDepthMiddleware':
                 110
             },
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'DEPTH_LIMIT': depth
         })
Example #8
    def crawl(cls, sites):
        stat = {"spiders": 0}

        def soft_stop_reactor():
            stat["spiders"] -= 1
            if not stat["spiders"]:
                reactor.stop()

        for site in sites:
            try:
                spider = site.parser.spider(site)
            except (NotImplementedError, ObjectDoesNotExist):
                logger.error(
                    _('Spider not implemented for "%s" site', site.label))
            else:
                stat["spiders"] += 1
                with spider_project(spider) as settings:
                    crawler = Crawler(settings)
                    crawler.signals.connect(
                        soft_stop_reactor,
                        signal=signals.spider_closed)  # reactor.stop
                    crawler.configure()
                    crawler.crawl(spider)
                    crawler.start()

        logfile = open('crawl.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
        log_observer.start()

        # the script will block here until the spider_closed signal is sent
        reactor.run()
Example #9
def main():
    """Setups item signal and run the spider"""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()

    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item,   signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
Example #10
 def setUp(self):
     """Initialize the test."""
     settings.LOG_LEVEL = 'DEBUG'
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
Example #11
def test_start_requests_http_error(spider_name):
    # See scrapy.crawler.CrawlerRunner._create_crawler
    spidercls = runner.spider_loader.load(spider_name)
    crawler = Crawler(spidercls, runner.settings)
    start_time = datetime(2001, 2, 3, 4, 5, 6)
    crawler.stats.set_value('start_time', start_time)

    try:
        # See scrapy.crawler.Crawler._create_spider
        spider = crawler.spidercls.from_crawler(crawler)

        for request in spider.start_requests():
            # See scrapy.core.scraper.Scraper.call_spider
            callback = request.callback or spider.parse

            response = Response('http://example.com',
                                status=555,
                                request=request)
            # If `max_retries` is set, the spider handles (and retries) error responses.
            if hasattr(spider, 'max_retries'):
                response.request.meta['retries'] = spider.max_retries
            items = list(callback(response))

            assert len(items) == 1
            for item in items:
                assert type(item) is FileError
                assert len(item) == 3
                assert item['errors'] == {'http_code': 555}
                assert item['file_name']
                assert item['url']
    except MissingEnvVarError as e:
        warnings.warn(f'{spidercls.name}: {e}')
Example #12
 def setup(self):
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.signals.connect(self._next_crawl, signal=signals.spider_closed)
     crawler.crawl(self.spider)
     crawler.start()
Example #13
    def kickoff(self):
        """
        Starts a new crawler
        :return: 
        """
        settings = Settings()

        # settings.set("USER_AGENT", "Test")
        settings.set('JOBDIR', self.args.data_dir)
        self.spider = MavenDataSpider()

        # Wrap with crawler, configure
        crawler = Crawler(self.spider, settings)
        crawler.signals.connect(spider_closing, signal=signals.spider_closed)

        logger.info('Starting crawler')
        crawler.crawl(self.spider, app=self, dbsess=self.session)

        self.spider = crawler.spider
        self.spider.link_queue_mode = False
        if self.args.debug:
            coloredlogs.install(level=logging.DEBUG)

        # Keeping thread working
        reactor.run()
Example #14
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_record_from_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
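
A minimal consumption sketch for the fixture above, assuming results is registered with @pytest.fixture (the decorator is not visible in this excerpt) and that records validated against the 'hep' schema expose a 'titles' field; both are assumptions rather than facts taken from the snippet.

def test_arxiv_records_have_titles(results):
    # `results` is the list of records yielded by the fixture above.
    for record in results:
        # 'titles' is an assumed hep-schema field, used only to illustrate
        # asserting on the parsed records.
        assert record.get('titles')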
Example #15
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipeline's payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(
        spider.parse(
            fake_response_from_file(
                file_name=str('pos/sample_pos_record.xml'), )))
    response = HtmlResponse(url=request.url,
                            request=request,
                            body=scrape_pos_conference_paper_page_body,
                            **{'encoding': 'utf-8'})
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
Example #16
def call_spider(file):
    """
    Crea el spider y ejecuta el reactor. Copia los resultados del crawling a los archivos .json para luego
    transformarlos a los archivos data.json correspondientes.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
        domains = []
        urls = []
        created_files = []
        for u in list_url:
            domain = u.strip('\n')
            url_aux = domain.split("/")
            domain_type = False
            if (len(url_aux) > 1):
                domain = url_aux[0]
                url = "http://" + url_aux[0] + "/datos/data"
                if domain == 'www.paraguay.gov.py':
                    url = "http://" + url_aux[0] + "/datos"
            else:
                url = "http://" + u.strip('\n') + "/data"
                domain_type = True
            print "============= Domain " + domain
            print "============= Start url " + url
            response = requests.get(url + "/data.json")
            if response.status_code == 200:
                filename = FileController.FileController(
                ).save_existing_data_json(response, domain, True)
                created_files.append({
                    'modalidad': 'recolecta',
                    'archivo': filename
                })
            else:
                domains.append(domain)
                urls.append(url)

        spider = DataSpider(domains=domains,
                            start_urls=urls,
                            domain_type=domain_type)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
        reactor.run()  # the script will block here
        """ Copiar los datos a los archivos .json """
        data_spider.copy_items_to_files()
        """ Eliminar archivos temporales """
        FileController.FileController().clean_tmp_files()
        """ Convertir los archivos .json a data.json (formato POD) """
        for domain in domains:
            filename = DataJson.DataJson().convert(domain)
            created_files.append({
                'modalidad': 'data-hunting',
                'archivo': filename
            })

        return created_files
Example #17
 def create_crawler2(self, spidercls_cls, spidercls_str):
     cls_settings = self.settings.copy()
     spidercls_info = self.get_spidercls_info(spidercls_str)
     extra_info = {'project_id': spidercls_info.get('project_id')}
     cls_settings.setdict(extra_info)
     logger.info(cls_settings)
     return Crawler(spidercls_cls, cls_settings)
Example #18
 def test_not_enabled(self):
     settings: Settings = Settings({'HTTPPROXY_ENABLED': False})
     crawler: Crawler = Crawler(_spider, settings)
     self.assertRaises(
         NotConfigured,
         partial(HttpProxyMiddleware.from_crawler, crawler)
     )
Example #19
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    spider.start()
Example #20
def crawl():
    for spider_test in spider_tests:
        print("Running spider: %s" %
              spider_test[SETTING]['HTTPCACHE_DIR'].split('/')[-1])
        crawler = Crawler(spider_test[SPIDER], spider_test[SETTING])
        yield runner.crawl(crawler)
    reactor.stop()
Example #21
    def handle(self, url_slug, **options):
        page = Page.objects.get(url_slug=url_slug)
        feed = page.feed
        store = page.store
        store_slug = store.slug.lower()
        opts = {
            'recreate_tiles': options['recreate_tiles'],
            'skip_images': not options['update_images'],
            'skip_tiles': True,
        }

        start_urls = []
        for tile in feed.tiles.all():
            if tile.product:
                start_urls.append(tile.product.url)
            for content in tile.content.all():
                for prod in content.tagged_products.all():
                    start_urls.append(prod.url)
        start_urls = set(start_urls)

        # set up standard framework for running spider in a script
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()

        spider = crawler.spiders.create(store_slug, **opts)
        spider.start_urls = start_urls
        spider.feed_id = feed.id

        crawler.crawl(spider)
        logging.info('Starting spider with options: {}'.format(opts))
        crawler.start()

        reactor.run()
Example #22
 def _setup(self, project):
     spider = crawlspider.LinkSpider(project)
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.crawl(spider)
     self.add_crawler()
Example #23
    def setup(self):
        logging.disable(logging.DEBUG)

        # Setting pipeline
        settings = {
            "DB_HOST": os.getenv("DB_HOST"),
            "DB_PORT": os.getenv("DB_PORT"),
            "DB_DATABASE": os.getenv("DB_DATABASE"),
            "DB_USERNAME": os.getenv("DB_USERNAME"),
            "DB_PASSWORD": os.getenv("DB_PASSWORD"),
        }
        crawler = Crawler(HorseRacingSpider, settings)
        self.pipeline = PostgreSQLPipeline.from_crawler(crawler)

        # Setting db
        with self.pipeline.session() as sess:
            sess.query(RaceInfoData).delete()
            sess.query(RacePayoffData).delete()
            sess.query(RaceResultData).delete()
            sess.query(RaceDenmaData).delete()
            sess.query(HorseData).delete()
            sess.query(TrainerData).delete()
            sess.query(JockeyData).delete()
            sess.query(OddsWinPlaceData).delete()
            sess.query(OddsBracketQuinellaData).delete()
            sess.query(OddsExactaData).delete()
            sess.query(OddsQuinellaData).delete()
            sess.query(OddsQuinellaPlaceData).delete()
            sess.query(OddsTrifectaData).delete()
            sess.query(OddsTrioData).delete()
            sess.commit()

        self.sess = self.pipeline.session()
Example #24
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object
    '''
    # Deferred means other functions can wait on this finishing
    # Wait until the callback is triggered by spider close
    # See twisted docs
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks to deferred
        # (we don't use it, so True could've been False, None w/e)
        d.callback(True)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Ref to foo otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider
    # N.B. dispatch returns spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B. log is the Scrapy log; log2 is the Python color logger.
    # The crawler arg is necessary for log_count/{ERROR, DEBUG, INFO..} stats
    # which you will want for the stats mailer extension.
    # Starting this each time would cause a flood of ESMTP errors
    # log.start(crawler=crawler)
    crawler.start()
    return d
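
The manual Deferred wiring above predates CrawlerRunner. On Scrapy 1.x+, runner.crawl() already returns a Deferred that fires when the spider closes, so the sketch below (an assumed modern equivalent, not part of the original snippet) achieves the same effect without a dummy spider_closed handler.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def setup_crawler(spidercls):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    # crawl() returns a Deferred that fires once the spider has closed,
    # so no explicit spider_closed signal handler is needed.
    return runner.crawl(spidercls)


# Typical wiring at the call site:
#     d = setup_crawler(MySpider)
#     d.addBoth(lambda _: reactor.stop())
#     reactor.run()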
Example #25
 def _crawl_next(self, spider):
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
     crawler.signals.connect(self._done_task, signal=signals.spider_closed)
     crawler.crawl(spider)
     crawler.start()
Example #26
 def __init__(self, splash_url, crawler_options):
     self.process = CrawlerProcess({'LOG_ENABLED': True})
     self.crawler = Crawler(
         self.TorSplashSpider,
         {
             'USER_AGENT':
             crawler_options['user_agent'],  # /!\ overwritten by lua script
             'SPLASH_URL': splash_url,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {
                 'scrapy_splash.SplashCookiesMiddleware':
                 723,
                 'scrapy_splash.SplashMiddleware':
                 725,
                 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
                 810,
                 'scrapy_splash.SplashDeduplicateArgsMiddleware':
                 100,
             },
             'SPIDER_MIDDLEWARES': {
                 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
             },
             'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
             'HTTPERROR_ALLOW_ALL': True,
             'RETRY_TIMES': 2,
             'CLOSESPIDER_PAGECOUNT':
             crawler_options['closespider_pagecount'],
             'DEPTH_LIMIT': crawler_options['depth_limit'],
             'SPLASH_COOKIES_DEBUG': False
         })
Example #27
 def __init__(self, spider):
     Process.__init__(self)
     settings = get_project_settings()
     self.crawler = Crawler(spider.__class__, settings)
     self.crawler.signals.connect(reactor.stop,
                                  signal=signals.spider_closed)
     self.spider = spider
Example #28
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
Example #29
    def test_spider_custom_settings_log_level(self):
        log_file = self.mktemp()
        class MySpider(scrapy.Spider):
            name = 'spider'
            custom_settings = {
                'LOG_LEVEL': 'INFO',
                'LOG_FILE': log_file,
                # disable telnet if not available to avoid an extra warning
                'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
            }

        configure_logging()
        self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
        crawler = Crawler(MySpider, {})
        self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
        info_count = crawler.stats.get_value('log_count/INFO')
        logging.debug('debug message')
        logging.info('info message')
        logging.warning('warning message')
        logging.error('error message')

        with open(log_file, 'rb') as fo:
            logged = fo.read().decode('utf8')

        self.assertNotIn('debug message', logged)
        self.assertIn('info message', logged)
        self.assertIn('warning message', logged)
        self.assertIn('error message', logged)
        self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
        self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
        self.assertEqual(
            crawler.stats.get_value('log_count/INFO') - info_count, 1)
        self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
Example #30
 def setUp(self):
     self.spider = Spider("foo")
     settings = Settings()
     settings.setmodule(default_settings)
     self.crawler = Crawler(Spider, settings)
     self.mw = CookiesMiddleware.from_crawler(self.crawler)
     self.mw.spider_opened(self.spider)