def setUp(self): """Initialize the test.""" settings.LOG_LEVEL = 'DEBUG' crawler = Crawler(CrawlerSettings(settings)) crawler.configure() self.spider = ebird_spider.EBirdSpider('REG') self.spider.set_crawler(crawler)
def setup_crawler(self, spider):
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(self.spider_closed, signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    self.crawler = crawler
    self.crawler.start()
def _setup(self, project):
    spider = crawlspider.LinkSpider(project)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    self.add_crawler()
def setUp(self): """Initialize the test.""" crawler = Crawler(CrawlerSettings(settings)) crawler.configure() self.spider = ebird_spider.EBirdSpider('REG') self.spider.set_crawler(crawler) self.requests = self.spider.start_requests()
def handle(self, *args, **options):
    self.stdout.write('Start')
    spider = LinuxFoundationSpider(year=options.get('year'))
    crawler = Crawler(spider, settings.SPIDER_SETTINGS)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()  # the script will block here until the spider_closed signal is sent
def run(self):
    crawler = Crawler(get_project_settings())
    crawler.configure()
    log.start()
    for spiderName in crawler.spiders.list():
        self.spiderCounter += 1
        self.setupCrawler(spiderName)
    reactor.run()
class listPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = listPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        #_listPageSpider = listPageSpider(taskId=self.spiderService.taskId)
        #self._crawler.start()

    def startCrawl(self):
        print '------------->listPageSpiderService->startCrawl'
        if self._crawler._spider is None:
            self._crawler.crawl(self._spider)
        else:
            print '>>>>>>>>>>>>>>>>>>>>>', self._crawler._spider
        if not self._crawler.engine.running:
            print '>>>>>>>>>>>>>>>>>>>>> _crawler.engine.running'
            self._crawler.start()
        else:
            if self._crawler.engine.paused:
                print '>>>>>>>>>>>>>>>>>>>>> _crawler.engine.unpause'
                if self._crawler._spider is not None:
                    print '>>>>>>>>>>>>>>>>>>>>> _crawler._spider.start_requests()'
                    self._crawler._spider.start_requests()
                self._crawler.engine.unpause()

    def pausedCrawl(self):
        print 'listPageSpiderService->pausedCrawl'
        if self._crawler._spider is not None:
            if not self.spiderService._startPageSpiderService._crawler.engine.running:
                print '------------------->_crawler.stop()'
                self._crawler.stop()
            else:
                if not self._crawler.engine.paused:
                    self._crawler.engine.pause()
        #if self._crawler.engine.running :
        #    if not self._crawler.engine.paused :
        #        print '?????????????????????????', 'pausedCrawl'
        #        self._crawler.engine.pause()

    def stopService(self):
        log.msg(format='listPageSpiderService->stopService stop listPageSpiderService serviceName=(%(serviceName)s)', serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler._spider.stopSpider()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
def setUp(self): """Initialize the test.""" crawler = Crawler(CrawlerSettings(settings)) crawler.configure() self.spider = ebird_spider.EBirdSpider('REG') self.spider.set_crawler(crawler) self.spider.start_requests() self.records = [{ 'checklistID': 'CL00001', 'comName': 'Common Name', 'countryCode': 'CC', 'countryName': 'Country', 'firstName': 'Name', 'howMany': 1, 'lastName': 'Surname', 'lat': 45.000000, 'lng': -45.000000, 'locID': 'L0000001', 'locName': 'Location 1', 'locationPrivate': True, 'obsDt': '2013-03-27 09:00', 'obsID': 'OBS0000001', 'obsReviewed': False, 'obsValid': True, 'presenceNoted': False, 'sciName': 'Scientific Name', 'subID': 'S0000001', 'subnational1Code': 'SN-01', 'subnational1Name': 'Region', 'subnational2Code': 'SN-02', 'subnational2Name': 'County', }, { 'checklistID': 'CL00002', 'comName': 'Common Name', 'countryCode': 'CC', 'countryName': 'Country', 'firstName': 'Name', 'howMany': 1, 'lastName': 'Surname', 'lat': 50.000000, 'lng': -50.000000, 'locID': 'L0000002', 'locName': 'Location 2', 'locationPrivate': True, 'obsDt': '2013-03-27 10:00', 'obsID': 'OBS0000002', 'obsReviewed': False, 'obsValid': True, 'presenceNoted': False, 'sciName': 'Scientific Name', 'subID': 'S0000002', 'subnational1Code': 'SN-01', 'subnational1Name': 'Region', 'subnational2Code': 'SN-02', 'subnational2Name': 'County', }]
def test_scrapy_spider():
    settings = Settings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats["example"]
    assert stats["frontera/crawled_pages_count"] == 5
    assert crawler.spider.callback_calls > 0
def main(): """Setups item signal and run the spider""" from twisted.internet import reactor from scrapy import signals from scrapy.settings import Settings from scrapy.crawler import Crawler def catch_item(sender, item, **kwargs): print "Got:", item settings = Settings() # set up crawler crawler = Crawler(settings) # shut off log crawler.settings.set('LOG_ENABLED', False, priority='cmdline') # set up signal to catch items scraped crawler.signals.connect(catch_item, signal=signals.item_passed) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.install() crawler.configure() # schedule spider spider = MySpider() crawler.crawl(spider) # start engine scrapy/twisted print "STARTING ENGINE" crawler.start() reactor.run() print "ENGINE STOPPED"
def start(self):
    settings = Settings()
    # crawl responsibly
    settings.set("USER_AGENT", "test")
    crawler_obj = Spider()
    crawler = Crawler(crawler_obj, settings)
    # stop reactor when spider closes
    crawler.signals.connect(self.stop, signal=signals.spider_closed)
    crawler.crawl()
def test_skip_parsing_webpages(self):
    """Verify no web requests are made if include_html is False."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    spider = ebird_spider.EBirdSpider('REG')
    spider.set_crawler(crawler)
    spider.start_requests()
    spider.include_html = False
    response = response_for_data(self.records)
    results = spider.parse_locations(response)
    self.assertEqual(0, sum(1 for _ in results))
def test_scrapy_spider(seeds_file, db_file):
    fs = FronteraSettings(module="tests.scrapy_spider.frontera.settings")
    add_seeds.run_add_seeds(fs, seeds_file)
    settings = ScrapySettings()
    settings.setmodule("tests.scrapy_spider.settings")
    crawler = Crawler(MySpider, settings=settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.crawl()
    reactor.run()
    stats = crawler.stats.spider_stats['example']
    assert stats['frontera/crawled_pages_count'] == 5
    assert crawler.spider.callback_calls > 0
class CrawlerScript(Process):

    def __init__(self, spider):
        Process.__init__(self)
        # settings = get_project_settings()
        self.crawler = Crawler(spider, settings)
        # self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl()
        # self.crawler.start()
        reactor.run()
class startPageSpiderService(service.Service):

    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        #dispatcher.connect(self.stopService, signals.spider_closed)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        #self._crawler.signals.connect(self.test2, 'writeListQuque')
        #_startPageSpider = startPageSpider(taskId=self.spiderService.taskId)
        self._crawler.crawl(self._spider)
        #self._crawler.start()
        self.startCrawl()

    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()

    #def test2(self):
    #    print '================>111111111111111111111111<=========================='

    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)', serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
def do_parse_test(html, n):
    start = time.time()
    spider = BenchmarkSpider(name="benchmark", start_urls=[html])
    crawler = Crawler(Settings(values={"TELNETCONSOLE_PORT": None}))
    crawler.configure()
    crawler.crawl(spider)
    for i in xrange(n):
        crawler.start()
    crawler.stop()
    stop = time.time()
    print stop - start, "s"
class JobCrawlerScript(Process):

    def __init__(self, spider, key_word, crawl_num, n_crawls):
        Process.__init__(self)
        settings = get_project_settings()
        self.spider = spider
        self.crawler = Crawler(spider.__class__, settings)
        # self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.n_crawls = n_crawls
        self.crawl_num = crawl_num
        self.key_word = key_word

    def run(self):
        self.crawler.crawl(self.spider, key_word=self.key_word,
                           crawl_num=self.crawl_num, n_crawls=self.n_crawls)
        reactor.run()
def __init__(self, spider):
    Process.__init__(self)
    settings = get_project_settings()
    self.crawler = Crawler(settings)
    self.crawler.configure()
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
def __init__(self, spider):
    Process.__init__(self)
    setting = Settings()
    setting.setmodule(s)  # `s` is expected to be a settings module imported elsewhere
    self.crawler = Crawler(setting)
    self.crawler.configure()
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.spider = spider
def __init__(self, settings):
    super(Scrapy, self).__init__()
    self.settings = settings
    self.spider = GamepediaSpider()
    self.crawler = Crawler(self.settings)
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)  # @UndefinedVariable
    self.crawler.configure()
    self.crawler.crawl(self.spider)
def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True,
            crawlLinks=True, crawlContents=True, crawlFormData=True):

    def catch_item(sender, item, **kwargs):
        item['url'] = item['url'].replace('http://127.0.0.1:' + str(localPort) + extraPath, hiddenWebSite)
        print "[+]Processing URL %s ... " % (item['url'])
        from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
        database = TortazoDatabase()
        database.initDatabaseDeepWebCrawlerPlugin()
        self.__processPage(item, database)

    # setup crawler
    dispatcher.connect(catch_item, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
    settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir + hiddenWebSite)
    crawler = Crawler(settings)
    crawler.configure()
    spider = HiddenSiteSpider("http://127.0.0.1:" + str(localPort) + extraPath, hiddenWebSite, self.extractorRules)
    spider.setImages(crawlImages)
    spider.setLinks(crawlLinks)
    spider.setContents(crawlContents)
    spider.setForms(crawlFormData)
    crawler.crawl(spider)
    print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
    crawler.start()
    reactor.run()
    print "[+] Crawler finished."
def setup_crawler(spider_class, **kwargs):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """
    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
def setup_crawler(ticker):
    spider = StatsSpider(ticker=ticker)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def crawl(cls, sites):
    stat = {"spiders": 0}

    def soft_stop_reactor():
        stat["spiders"] -= 1
        if not stat["spiders"]:
            reactor.stop()

    for site in sites:
        try:
            spider = site.parser.spider(site)
        except (NotImplementedError, ObjectDoesNotExist):
            logger.error(_('Spider not implemented for "%s" site', site.label))
        else:
            stat["spiders"] += 1
            with spider_project(spider) as settings:
                crawler = Crawler(settings)
                crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                crawler.configure()
                crawler.crawl(spider)
                crawler.start()

    logfile = open('crawl.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
    log_observer.start()

    # the script will block here until the spider_closed signal is sent
    reactor.run()
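
# The hand-rolled spider counter above can be avoided on Scrapy >= 1.0:
# CrawlerRunner.join() returns a Deferred that fires once every scheduled
# crawl has finished. A minimal sketch, assuming spider classes are passed
# in; crawl_all is a placeholder name.
def crawl_all(spider_classes):
    from twisted.internet import reactor
    from scrapy.crawler import CrawlerRunner

    runner = CrawlerRunner()
    for spidercls in spider_classes:
        runner.crawl(spidercls)
    d = runner.join()                    # fires when all crawls are done
    d.addBoth(lambda _: reactor.stop())  # then stop the reactor exactly once
    reactor.run()                        # blocks until reactor.stop()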
def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website, validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler(id="550", publisher="rbd"): spider = DmmQuerySpider(id, publisher) settings = get_project_settings() crawler = Crawler(settings) crawler.configure() crawler.crawl(spider) crawler.start()
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': False})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
    })
def test_populate_spidercls_settings(self):
    spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
    project_settings = {'TEST1': 'project', 'TEST3': 'project'}

    class CustomSettingsSpider(DefaultSpider):
        custom_settings = spider_settings

    settings = Settings()
    settings.setdict(project_settings, priority='project')
    crawler = Crawler(CustomSettingsSpider, settings)
    self.assertEqual(crawler.settings.get('TEST1'), 'spider')
    self.assertEqual(crawler.settings.get('TEST2'), 'spider')
    self.assertEqual(crawler.settings.get('TEST3'), 'project')
    self.assertFalse(settings.frozen)
    self.assertTrue(crawler.settings.frozen)
def get_crawler(settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used as the settings present in the settings module of the
    CrawlerSettings.
    """
    from scrapy.crawler import Crawler
    from scrapy.settings import CrawlerSettings

    class SettingsModuleMock(object):
        pass

    settings_module = SettingsModuleMock()
    if settings_dict:
        for k, v in settings_dict.items():
            setattr(settings_module, k, v)
    settings = CrawlerSettings(settings_module)
    return Crawler(settings)
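
# The helper above reflects the pre-1.0 API, where CrawlerSettings wrapped a
# settings module. On modern Scrapy, CrawlerSettings is gone and a ready-made
# test helper ships with the library; a minimal sketch of the equivalent,
# assuming a plain Spider subclass is acceptable as the spider class:
#
#     from scrapy.spiders import Spider
#     from scrapy.utils.test import get_crawler
#
#     crawler = get_crawler(Spider, settings_dict={'LOG_ENABLED': False})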
class UrlCrawlerScript(Process):

    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        self.crawler.configure()
        # self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
def run():
    log.start(loglevel=log.DEBUG)
    settings = Settings()
    # crawl responsibly
    settings.set("USER_AGENT", "Gitlaw-ca Scraper (+https://github.com/JasonMWhite/gitlawca-scraper)")
    settings.set("ITEM_PIPELINES", {'gitlawca.scraper.pipelines.LawscraperPipeline': 100})
    crawler = Crawler(settings)
    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(CanadaLawSpider())
    crawler.start()
    reactor.run()
def main():
    command_line_args = parse_arguments()
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = goodsmatrix.spider.GoodsMatrixSpider(command_line_args.category)
    settings = get_project_settings()
    pipelines_order_dict = {
        "goodsmatrix.pipelines.postprocessors.UnescapeSpecialHTMLEntities": 2,
        "goodsmatrix.pipelines.postprocessors.ExtractEsl": 3,
        "goodsmatrix.pipelines.postprocessors.ExtractEAdditives": 4,
        "goodsmatrix.pipelines.postprocessors.StripMultilineStringProperties": 5,
        "goodsmatrix.pipelines.postprocessors.ExtractIngredients": 6,
    }
    if command_line_args.persistence:
        pipelines_order_dict["goodsmatrix.pipelines.writers.PersistentRDFPipeline"] = 10
    else:
        pipelines_order_dict["goodsmatrix.pipelines.writers.InMemoryRDFPipeline"] = 10
    if command_line_args.agrovoc_endpoint:
        settings.set("AGROVOC_ENDPOINT", command_line_args.agrovoc_endpoint)
    if command_line_args.api_key:
        pipelines_order_dict["goodsmatrix.pipelines.postprocessors.Translator"] = 7
        settings.set("YANDEX_TRANSLATE_API_URI", command_line_args.api_key)
    settings.set("ITEM_PIPELINES", pipelines_order_dict)
    if command_line_args.old_endpoint:
        pipelines_order_dict["goodsmatrix.pipelines.postprocessors.SkipIfExistsInOldGraph"] = 1
        settings.set("OLD_ENDPOINT_URI", command_line_args.old_endpoint)
    settings.set("OUTPUT_FILENAME", command_line_args.output_filename)
    settings.set("COOKIES_ENABLED", False)
    settings.set("REDIRECT_ENABLED", False)
    settings.set("LOG_FORMATTER", "goodsmatrix.spider.PoliteLogFormatter")
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='INFO')
    reactor.run()  # the script will block here
def test_proxy_auth_encoding(self):
    # utf-8 encoding
    os.environ['http_proxy'] = u'https://m\u00E1n:pass@proxy:3128'
    settings = deepcopy(self.settings)
    settings.update({'HTTPPROXY_AUTH_ENCODING': 'utf-8'})
    crawler = Crawler(spider, settings)
    mw = HttpProxyMiddleware.from_crawler(crawler)
    mw.spider_opened(self.spider)
    cached_proxy_bypass.cache_clear()
    req = Request('http://scrapytest.org')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta, {'proxy': 'https://*****:*****@proxy:3128'})
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta, {'proxy': 'https://*****:*****@proxy:3128'})
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.meta, {'proxy': 'https://proxy:3128'})
    self.assertEqual(req.headers.get('Proxy-Authorization'), b'Basic /HNlcjpwYXNz')
    mw.spider_closed(self.spider)
def test_from_crawler_method_should_initialize_the_driver(self):
    """Test that the ``from_crawler`` method should initialize the selenium driver"""
    crawler = Crawler(
        spidercls=self.spider_klass,
        settings=self.settings
    )
    selenium_middleware = SeleniumMiddleware.from_crawler(crawler)

    # The driver must be initialized
    self.assertIsNotNone(selenium_middleware.driver)

    # We can now use the driver
    selenium_middleware.driver.get('http://www.python.org')
    self.assertIn('Python', selenium_middleware.driver.title)

    selenium_middleware.driver.close()
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = spider.parse(fake_response_from_file('pos/sample_pos_record.xml')).next()
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        **{'encoding': 'utf-8'}
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    return parsed_record
class TechcrunchCrawler(Process):

    def __init__(self, spider):
        Process.__init__(self)
        # note: Crawler() without a settings argument fails on most Scrapy
        # versions; typically Crawler(get_project_settings()) is expected here
        self.crawler = Crawler()
        self.crawler.configure()
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
def handle(self, *args, **options):
    spider = apd.ApdSpider()
    settings = Settings()
    settings.setdict({
        'BOT_NAME': 'CrimeReport',
        'USER_AGENT': 'Crime Scraper (+http://www.dailytexanonline.com/)',
        'ITEM_PIPELINES': [
            'crimeAPI.scraper.CrimeReport.CrimeReport.pipelines.CrimeReportPipeline'
        ],
    })
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel=scrapy.log.INFO)
    reactor.run()
def start_job(self, job=None):
    runner = CrawlerRunner()
    crawler_job = job['crawler_job']
    cti_runner = job['runner']
    crawler_cls = crawler_job['crawler_cls']
    crawler_kwargs = crawler_job['crawler_kwargs']

    def engine_stopped_callback():
        cti_runner.transform_and_index()

    crawler = Crawler(crawler_cls, Settings(cti_runner.settings))
    crawler.signals.connect(engine_stopped_callback, signals.engine_stopped)
    runner.crawl(crawler, **crawler_kwargs)
    """
    d = runner.crawl(crawler, **crawler_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
def test_populate_spidercls_settings(self):
    spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
    project_settings = {'TEST1': 'project', 'TEST3': 'project'}

    class CustomSettingsSpider(DefaultSpider):
        custom_settings = spider_settings

    settings = Settings()
    settings.setdict(project_settings, priority='project')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        crawler = Crawler(CustomSettingsSpider, settings)
    self.assertEqual(crawler.settings.get('TEST1'), 'spider')
    self.assertEqual(crawler.settings.get('TEST2'), 'spider')
    self.assertEqual(crawler.settings.get('TEST3'), 'project')
    self.assertFalse(settings.frozen)
    self.assertTrue(crawler.settings.frozen)
def spider_results():
    results = []

    def crawler_results(signal, sender, item, response, spider):
        results.append(item)

    # dispatcher.connect(crawler_results, signal=signals.item_passed)
    # process = CrawlerProcess({
    #     'ITEM_PIPELINES': {'__main__.ItemCollectorPipeline': 100}
    # })
    crawler = Crawler(shoppingSpider)
    crawler.signals.connect(crawler_results, signals.item_scraped)
    process = CrawlerProcess(get_project_settings())
    process.crawl(crawler)
    process.start()  # the script will block here until the crawling is finished
    return results
def setUp(self):
    from scrapy.spider import Spider
    from scrapy.utils.test import get_crawler
    from scrapy.crawler import Crawler

    #self.crawler = get_crawler(self.settings_dict)
    self.crawler = Crawler(get_project_settings())
    self.spider = Spider('foo')

    from scrapy import log
    import sys
    from cStringIO import StringIO

    self.level = log.INFO
    self.level = log.DEBUG  # the DEBUG level wins; the assignment above is dead
    self.encoding = 'utf-8'
    self.f = StringIO()
    self.f = sys.stdout  # logging to stdout wins; the StringIO above is dead
    self.sflo = log.ScrapyFileLogObserver(self.f, self.level, self.encoding)
    self.sflo.start()
def run(self):
    open(self.scrapy_log_file, 'w').close()
    log.start(logfile=self.scrapy_log_file, loglevel="WARNING", logstdout=False)
    cur = self.conn.cursor()
    cur.execute('SET NAMES UTF8')
    cur.execute('USE %s' % self.database)
    cur.execute('SELECT url, notes FROM {table}'.format(table=self.urls_table))
    res = cur.fetchall()
    start_urls = {i[0]: i[1] for i in res}
    if not start_urls:
        return
    self.crawler_list = []
    for url in start_urls.keys():
        url = url.strip()
        if not url.startswith("http://") and not url.startswith("https://"):
            url = "http://%s/" % url
        # create a crawler instance
        crawler = Crawler(self.settings)
        spider = AutoSpider(self.conn, self.database, self.webpages_table,
                            self.urls_table, self.log_table, url, start_urls[url])
        self.crawler_list.append(spider)
        crawler.configure()
        # call spider_closing automatically when the spider terminates
        crawler.signals.connect(self.spider_closing, signal=signals.spider_closed)
        crawler.crawl(spider)
        crawler.start()
    self.flag = 1
    reactor.run()
def pytest_funcarg__spider(request):
    """Use scrapy's overrides to start a spider w/ specific settings"""
    # This is necessary because the spider errors when a source file is not
    # provided.
    settings = get_project_settings()
    settings.overrides['URLS'] = u"spade/tests/sitelists/urls.txt"
    settings.overrides['LOG_ENABLED'] = True

    # Initialize and return spider
    spider = GeneralSpider()
    spider.set_crawler(Crawler(settings))
    now = spider.get_now_time()
    spider.batch = model.Batch.objects.create(kickoff_time=now, finish_time=now)
    spider.batch.save()

    # Delete created batch from database when test is done
    request.addfinalizer(lambda: spider.batch.delete())
    return spider
def create_crawler_object(spider_, settings_):
    """
    For the given scrapy settings and spider create a crawler object

    Args:
        spider_ (class obj): The scrapy spider class object
        settings_ (class obj): The scrapy settings class object

    Returns:
        A scrapy crawler class object
    """
    crwlr = Crawler(settings_)
    crwlr.configure()
    crwlr.crawl(spider_)
    return crwlr
def scrape_for_versions(xy_versions, dist_dir, allow_prompt=False):
    spider_kwargs = {
        'dist_dir': dist_dir,
        'allow_prompt': allow_prompt,
    }
    runner = CrawlerRunner()
    for spider_class in (JenkinsRPMScraper, JenkinsWarScraper):
        for xy_version in xy_versions:
            crawler = Crawler(spider_class, SCRAPY_SETTINGS)
            runner.crawl(crawler, xy_version=xy_version, **spider_kwargs)
    deferred = runner.join()
    # stop the reactor on success or error
    deferred.addBoth(lambda _: reactor.stop())
    try:
        reactor.run()
    except ReactorNotRestartable:
        # This is an exception. We aren't trying to restart the reactor at this
        # point, since it should have been stopped with the callback. Regardless,
        # twisted still throws this exception and I didn't feel terribly
        # interested in finding out why.
        pass
class UrlCrawlerScript(Process):

    def __init__(self, spider):
        Process.__init__(self)
        setting = Settings()
        setting.setmodule(settings, 1)
        self.crawler = Crawler(setting)
        if not hasattr(project, 'crawler'):
            self.crawler.configure()
            self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
def results_from_json():
    """Return results by parsing a JSON file."""
    from scrapy.http import TextResponse

    crawler = Crawler(spidercls=aps_spider.APSSpider)
    spider = aps_spider.APSSpider.from_crawler(crawler, aps_token="secret")
    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'aps/aps_single_response.json',
                response_type=TextResponse,
            )
        )
    )

    class MockFailure:
        """Mock twisted.python.failure.Failure, failure on JATS request."""
        def __init__(self):
            self.request = parsed_items[0]

    records = [spider._parse_json_on_failure(MockFailure()).record]
    assert records
    return records
def get_crawler(self, spider):
    """
    do some specific settings
    :param spider: spider class
    :return: crawler
    """
    settings = crawler_runner.settings
    # FIX it!
    # conf = {}
    # log_file = crawler_runner.settings.get('LOG_FILE')
    # if log_file:
    #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
    #     conf['LOG_FILE'] = None
    # conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
    #                       ' [spider-{spider}]'
    #                       ' %(message)s'
    #                       ).format(spider=spider.name)
    # settings = updated_crawler_settings(settings, conf)
    # configure_logging(settings)
    return Crawler(spider, settings)
def crawl_spider(domain, day1, day2):
    spider_dict = {'agoda.com': AgodaSpider, 'ivivu.com': IvivuSpider}
    args = {
        'from_date': datetime.now() + timedelta(days=day1),
        'to_date': datetime.now() + timedelta(days=day2),
    }
    print "\n crawl spider==========="
    spider = spider_dict.get(domain, AgodaSpider)
    spider = spider(args)
    settings_module = import_module('scraper.scraper.settings')
    settings = CrawlerSettings(settings_module)
    settings.overrides['SPIDER_MODULES'] = ['scraper.scraper.spiders']
    # settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def test_spider_custom_settings_log_level(self):
    log_file = self.mktemp()
    with open(log_file, 'wb') as fo:
        fo.write('previous message\n'.encode('utf-8'))

    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {
            'LOG_LEVEL': 'INFO',
            'LOG_FILE': log_file,
            # settings to avoid extra warnings
            'REQUEST_FINGERPRINTER_IMPLEMENTATION': 'VERSION',
            'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
        }

    configure_logging()
    self.assertEqual(get_scrapy_root_handler().level, logging.DEBUG)
    crawler = Crawler(MySpider, {})
    self.assertEqual(get_scrapy_root_handler().level, logging.INFO)
    info_count = crawler.stats.get_value('log_count/INFO')
    logging.debug('debug message')
    logging.info('info message')
    logging.warning('warning message')
    logging.error('error message')

    with open(log_file, 'rb') as fo:
        logged = fo.read().decode('utf-8')

    self.assertIn('previous message', logged)
    self.assertNotIn('debug message', logged)
    self.assertIn('info message', logged)
    self.assertIn('warning message', logged)
    self.assertIn('error message', logged)
    self.assertEqual(crawler.stats.get_value('log_count/ERROR'), 1)
    self.assertEqual(crawler.stats.get_value('log_count/WARNING'), 1)
    self.assertEqual(crawler.stats.get_value('log_count/INFO') - info_count, 1)
    self.assertEqual(crawler.stats.get_value('log_count/DEBUG', 0), 0)
def run(self, args, opts):
    if len(args) != 1:
        raise UsageError("Please pass one website URL as argument")
    site = args[0]
    crawler = Crawler(Spider)
    self.crawler_process.crawl(
        crawler,
        site=site,
        opengraph=self.settings["OPENGRAPH"],
        disqus=self.settings["DISQUS"],
        **opts.spargs,
    )
    self.crawler_process.start()
    if self.crawler_process.bootstrap_failed:
        self.exitcode = 1
    exception_count = crawler.stats.get_value("weblint_errors")
    if exception_count:
        print("FAILED: See errors above")
        self.exitcode = 1
    else:
        print("SUCCESS")
def record():
    """Return results generator from the crossref spider. All fields, one record."""
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_items = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_items, spider)

    clean_dir()
def test_spider_custom_settings_log_append(self):
    log_file = self.mktemp()
    with open(log_file, 'wb') as fo:
        fo.write('previous message\n'.encode('utf-8'))

    class MySpider(scrapy.Spider):
        name = 'spider'
        custom_settings = {
            'LOG_FILE': log_file,
            'LOG_FILE_APPEND': False,
            # disable telnet if not available to avoid an extra warning
            'TELNETCONSOLE_ENABLED': telnet.TWISTED_CONCH_AVAILABLE,
        }

    configure_logging()
    Crawler(MySpider, {})
    logging.debug('debug message')

    with open(log_file, 'rb') as fo:
        logged = fo.read().decode('utf-8')

    self.assertNotIn('previous message', logged)
    self.assertIn('debug message', logged)
def run_crawler(keywords, proxies, search_type):
    """
    :param keywords: a list of keywords to be used as search terms (unicode characters supported)
    :param proxies: one of them is selected at random and used to perform all the HTTP requests
        (you can get a free list of proxies to work with at https://free-proxy-list.net/)
    :param search_type: the type of object we are searching for (Repositories, Issues and Wikis supported)
    """
    result = []

    def collect_items(item, response, spider):
        result.append(item)

    crawler = Crawler(GitSpider)
    crawler.signals.connect(collect_items, signals.item_scraped)
    process = CrawlerProcess(get_project_settings())
    process.crawl(crawler, query=' '.join(keywords), proxy=random.choice(proxies), search_type=search_type)
    process.start()
    return result
class WebCrawler():

    def __init__(self):
        default_settings.ITEM_PIPELINES = 'pipelines.JsonExportPipeline'
        self.crawler = Crawler(Settings())
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.crawler.configure()

    def _crawl(self, url):
        spider = MySpiders.TvShowSpider(start_url=url)
        self.crawler.crawl(spider)
        self.crawler.start()
        reactor.run()

    def run(self, url):
        p = Process(target=self._crawl, args=[url])
        p.start()
        p.join()
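
# Spawning a fresh process per crawl, as WebCrawler does above, sidesteps
# Twisted's non-restartable reactor. A minimal sketch of the same idea with
# the modern API; MySpider and the function names are placeholders.
from multiprocessing import Process as _Process

def _crawl_once(url):
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({'LOG_ENABLED': False})
    process.crawl(MySpider, start_url=url)  # MySpider: your spider class
    process.start()  # starts and stops a reactor local to this process

def run_in_subprocess(url):
    p = _Process(target=_crawl_once, args=(url,))
    p.start()
    p.join()  # each call gets a fresh reactor, so this can be repeated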
def __init__(self, splash_url, crawler_depth_limit):
    self.process = CrawlerProcess({'LOG_ENABLED': False})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'DEPTH_LIMIT': crawler_depth_limit,
    })