def start_crawler( data, **kwargs):
    """Run a Scrapy crawl over rows of *data* and return the scraped items.

    data: column-indexable container — columns are read via
          data[col].tolist(), so presumably a pandas DataFrame (TODO confirm).
    kwargs:
        idx    -- column name holding the URLs/ids to crawl.
        source -- optional column of source sites; when present the 'Knipex'
                  spider searches, otherwise 'Astro' hits pages directly.
    """
    # todo base spider logic, modularity (time-consuming)
    idx = kwargs.get('idx')
    process = CrawlerProcess(get_project_settings())
    items = []

    def item_scraped(item, response, spider):
        # Collect every item that passes through the pipelines.
        items.append(item)

    if 'source' in kwargs:
        # checks if crawler needs a source site and keyword to search with
        # NOTE(review): this crawler is created but never passed to
        # process.crawl() — the 'Knipex' crawl below runs on a different
        # crawler, so item_scraped may never fire for it; also
        # create_crawler() with no argument fails on recent Scrapy. Confirm
        # against the Scrapy version in use.
        crawler = process.create_crawler()
        source = kwargs.get('source')
        process.crawl('Knipex', url=data[source].tolist(), idx=data[idx].tolist())
    else:
        crawler = process.create_crawler('Astro')
        crawler.signals.connect(item_scraped, signal=signals.item_passed)
        process.crawl(crawler, url=data[idx].tolist())  # hits the page directly without searching
    process.start()  # blocks until crawling is finished
    return items
class taskService(service.MultiService) :
    """Twisted MultiService owning one Scrapy CrawlerProcess per task.

    Python 2 code using the pre-1.0 Scrapy API where create_crawler(name)
    registers a named crawler on the process.
    """

    def __init__(self, taskId, taskName, setting):
        service.MultiService.__init__(self)
        self.taskId = taskId
        self.name = taskName  # MultiService uses .name for child lookup
        self.setting = setting
        self._crawlerProcess = CrawlerProcess(project_settings)
        # Pre-register the start-page crawler; the others are disabled.
        self._crawlerProcess.create_crawler('start_page_crawler')
        # self._crawlerProcess.create_crawler('list_page_crawler')
        # self._crawlerProcess.create_crawler('content_page_crawler')
        # self._crawlerProcess.create_crawler('extra_page_crawler')
        #
        # self._listPageService = listPageService()
        # self._contentPageService = contentPageService()
        # self._extraPageService = extraPageService()

    def startService(self):
        """Inject start-url settings, start the crawler process and the
        child page services."""
        _spider_start_page_setting = self.setting.get(SPIDER_TYPE_START_PAGE)
        if _spider_start_page_setting is not None:
            _spider_start_page_setting['szStartUrl'] = self.setting.get('szStartUrl')
            _spider_start_page_setting['szRegStartUrl'] = self.setting.get('szRegStartUrl')
            # self._startStartPageSpider(_spider_start_page_setting)
            self._crawlerProcess.start()
        # sService = startPageService(self)
        lSeevice = listPageService(self)  # NOTE(review): typo'd local name, kept as-is
        cService = contentPageService(self)
        # eService = extraPageService(self)
        # self.addService(sService)
        self.addService(lSeevice)
        self.addService(cService)
        # self.addService(eService)
        service.MultiService.startService(self)
        log.msg('taskService->startService')

    def _startStartPageSpider(self, config):
        # Look up the crawler registered in __init__ by name (old API:
        # crawlers is a name->crawler mapping).
        startPageCrawler = self._crawlerProcess.crawlers.get('start_page_crawler')
        print '======>'
        print startPageCrawler
        # startPageSpider = load_object(config.get('szSnameSpace'))
        # startPageCrawler.crawl(startPageSpider)

    def stopService(self):
        service.MultiService.stopService(self)
        log.msg('taskService->stopService')
def lambda_handler(event, context):
    """AWS Lambda entry point: run VillaSpider once and report request stats.

    Returns an API-Gateway-style response whose body summarises the request
    count and the mean response time of the crawl.
    """
    tz = pytz.timezone(TIMEZONE)
    now_str = datetime.now(tz).strftime("%d-%m-%Y_%H:%M")
    scrapy_settings = SCRAPY_SETTINGS
    # Interpolate the timestamp so each run writes its own feed file.
    scrapy_settings["FEED_URI"] = scrapy_settings["FEED_URI"].format(now_str)
    process = CrawlerProcess(scrapy_settings)
    # Bug fix: the original called process.crawl(VillaSpider) AND then also
    # scheduled the explicitly created crawler, so the spider ran twice per
    # invocation. Schedule only the crawler we keep a handle to (we need it
    # afterwards for stats and response_times).
    crawler = process.create_crawler(VillaSpider)
    process.crawl(crawler)
    process.start()  # blocks until the crawl is finished
    stats = crawler.stats.get_stats()
    mean_response_time = statistics.mean(crawler.spider.response_times)
    result = f"{stats['downloader/request_count']} requests " \
             f"with an average response time of {round(mean_response_time, 2)} seconds"
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": result,
        }),
    }
def f(queue):
    """Run a crawl in this process and push the spider's data onto *queue*.

    NOTE(review): `spider` and `kwargs` are not parameters of this function —
    they must exist as module-level names (or this runs via multiprocessing
    with them defined globally); confirm, otherwise this raises NameError.
    """
    process = CrawlerProcess(get_project_settings())
    crawl = process.create_crawler(spider)
    process.crawl(crawl, **kwargs)
    process.start()  # blocks until the crawl is finished
    # The spider is expected to expose get_data() with the collected results.
    data = crawl.spider.get_data()
    queue.put(data)
def startSpider(group_type, spider_type, spider_group_name, spider_name):
    """Build, wire up and run one spider (pre-1.0 Scrapy crawler API)."""
    # Load the Scrapy project settings.
    settings = get_project_settings()
    # Instantiate a crawler process; one process can run several crawls.
    crawlerProcess = CrawlerProcess(settings)
    # Create a named crawler.
    crawler = crawlerProcess.create_crawler(spider_name)
    # Connect lifecycle signals to their handlers so spider state changes
    # are reported.
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.idleSingnal, signals.spider_idle)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)
    # Look up the spider class and its default kwargs for this group/type.
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()  # copy so the shared config is not mutated
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"
    spider = spiderConf[0](**spiderArgs)
    # Attach the spider to the crawler (configure() is the pre-1.0 API).
    crawler.configure()
    crawler.crawl(spider)
    # Start the process (blocks until done), then stop it.
    crawlerProcess.start()
    crawlerProcess.stop()
def startSpiderTest(group_type,spider_type,spider_group_name,spider_name):
    """Test variant of startSpider: identical flow, but does NOT connect the
    spider_idle signal (pre-1.0 Scrapy crawler API)."""
    # Load the Scrapy project settings.
    settings = get_project_settings()
    # Instantiate a crawler process; one process can run several crawls.
    crawlerProcess = CrawlerProcess(settings)
    # Create a named crawler.
    crawler = crawlerProcess.create_crawler(spider_name)
    # Connect lifecycle signals (no idle handler in the test variant).
    crawler.signals.connect(spiderSignal.startSingnal, signals.spider_opened)
    crawler.signals.connect(spiderSignal.errorSingnal, signals.spider_error)
    crawler.signals.connect(spiderSignal.stopSingnal, signals.spider_closed)
    # Look up the spider class and its default kwargs for this group/type.
    spiderConf = Spider_Dict[group_type][spider_type]
    spiderArgs = spiderConf[1].copy()  # copy so the shared config is not mutated
    spiderArgs["name"] = spider_name
    spiderArgs["redis_key"] = spider_name
    spiderArgs["spider_type"] = spider_type
    spiderArgs["spider_group_name"] = spider_group_name
    spiderArgs["task_id"] = "-1"
    spider = spiderConf[0](**spiderArgs)
    # Attach the spider to the crawler (configure() is the pre-1.0 API).
    crawler.configure()
    crawler.crawl(spider)
    # Start the process (blocks until done), then stop it.
    crawlerProcess.start()
    crawlerProcess.stop()
def run(cls, dependencies):
    """Create a crawler for this spider class, run it to completion and
    return the finished crawler (for stats/spider inspection)."""
    crawler_process = CrawlerProcess(dependencies.scrapy_settings)
    spider_crawler = crawler_process.create_crawler(cls)
    crawler_process.crawl(spider_crawler, dependencies)
    # The script blocks here until the crawling is finished.
    crawler_process.start()
    return spider_crawler
def get_fetch(log=False):
    """Return a thread-safe fetch(url_or_request) closure backed by a Scrapy
    shell whose reactor runs on a daemon thread.

    Uses the pre-1.0 Scrapy API (create_crawler() with no argument,
    start_crawling()/start_reactor()).
    """
    settings = Settings()
    settings.set('LOG_ENABLED', log)
    crawler_process = CrawlerProcess(settings)
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()
    # Run the reactor in the background so fetch() can block the caller.
    t = Thread(target=crawler_process.start_reactor)
    t.daemon = True
    t.start()
    shell = Shell(crawler)
    shell.code = 'adsf'  # NOTE(review): placeholder value; purpose unclear
    import threading
    lock = threading.Lock()

    def fetch(url_or_request):
        # Serialise access to the single shared shell instance.
        lock.acquire()
        try:
            shell.fetch(url_or_request)
            response = shell.vars.get('response')
            return response
        finally:
            lock.release()
    return fetch
class ScrapyPuppeteerTestCase(TestCase):
    """Test case for the ``scrapy-puppeteer`` package"""

    class PuppeteerSpider(scrapy.Spider):
        # Spider under test: renders https://ufmg.br via Puppeteer and
        # collects the footer's ordered-list items.
        name = 'puppeteer_crawl_spider'
        allowed_domains = ['ufmg.br']
        # NOTE(review): class-level mutable list — shared by all instances
        # and accumulating across runs if the class is reused.
        items = []

        def start_requests(self):
            yield scrapy_puppeteer.PuppeteerRequest('https://ufmg.br', wait_until='networkidle2')

        def parse(self, response):
            for selector_item in response.selector.xpath(
                    '//*[@id="rodape"]/section[1]/div/div[1]/div/ol/li'):
                self.items.append(selector_item)

    def setUp(self):
        """Store the Scrapy runner to use in the tests"""
        # NOTE(review): `custom_settings` aliases the same dict as
        # self.settings; the extra name is unused.
        self.settings = custom_settings = {
            'DOWNLOADER_MIDDLEWARES': {
                'scrapy_puppeteer.PuppeteerMiddleware': 800
            }
        }
        self.process = CrawlerProcess(settings=self.settings)

    def test_items_number(self):
        # Live crawl against ufmg.br; asserts the footer list has 12 entries.
        crawler = self.process.create_crawler(self.PuppeteerSpider)
        self.process.crawl(crawler)
        self.process.start()
        self.assertEqual(len(crawler.spider.items), 12)
def main():
    """Interactive loop: read ISBNs (from argv or stdin) and run bookspider
    once per ISBN, re-exec'ing the script between runs because the Twisted
    reactor cannot be restarted."""
    print(
        "Inserisci un ISBN e premi invio per raccogliere i dati, oppure inserisci 'stop' per terminare"
    )
    mySpider = "bookspider"
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler(mySpider)
    # Connect close_spider to the spider_closed signal.
    crawler.signals.connect(close_spider, signals.spider_closed)
    # Receive the ISBN.
    global ISBN_RECEIVED
    if ISBN_RECEIVED is not None:
        # Started with a command-line argument: crawl that single ISBN.
        process.crawl(crawler, isbn=ISBN_RECEIVED)
        process.start()
    else:
        # Otherwise prompt until the user types 'stop'.
        while ISBN_RECEIVED != "stop":
            ISBN_RECEIVED = input("\n[ISBN] > ")
            if ISBN_RECEIVED != "stop":
                # Run the crawl for the entered ISBN.
                process.crawl(crawler, isbn=ISBN_RECEIVED)
                process.start()
                ISBN_RECEIVED = None
                # Re-exec the whole script so another crawl is possible,
                # from https://stackoverflow.com/a/47127561
                time.sleep(0.5)
                os.execl(sys.executable, sys.executable, *sys.argv)
def execute_spiders(urls, run_name):
    """Run the matching spider for each URL and print per-spider stats.

    urls: iterable of listing URLs (nepremicnine.net / bolha.com are known).
    run_name: label passed to every spider for this run.
    """
    process = CrawlerProcess(get_project_settings())
    spiders = []  # actually Crawler objects, kept for post-run stats
    export_headers = True
    for url in urls:
        if "nepremicnine.net" in url:
            spider_name = "nepremicnine"
        elif "bolha.com" in url:
            spider_name = "bolha"
        else:
            # Typo fixed in the message ("spdider" -> "spider").
            print("No spider for url: " + url + ", skipping ...")
            continue
        spider = process.create_crawler(spider_name)
        spiders.append(spider)
        process.crawl(spider, url=url, run_name=run_name, export_headers=export_headers)
        export_headers = False  # so only first wil export them
    process.start()  # the script will block here until the crawling is finished
    for spider in spiders:
        stats = spider.stats.get_stats()
        print("Spider " + spider.spider.name + " executed in " + str(stats.get("elapsed_time_seconds")))
        print(" Scraped " + str(stats.get("item_scraped_count", 0)) + " items")
        if "log_count/ERROR" in stats:
            # Bug fix: the loop variable is a Crawler, not a Spider — its
            # name lives at .spider.name (as used two lines above); the
            # original `spider.name` raised AttributeError here.
            print(" Errors in spider " + spider.spider.name + "!!!")
        print()
def check(urls, auth=None, crawl=True, robotstxt=True, verbosity=0): """Crawl a list of url""" # we somehow need to append an empty string for bold to work puts(colored.white('Checking forms:', bold=True) + '') settings = get_project_settings() if verbosity > 0: settings.set('LOG_ENABLED', True) log_levels = ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'] if verbosity >= len(log_levels): verbosity = len(log_levels) - 1 settings.set('LOG_LEVEL', log_levels[verbosity]) if not robotstxt: settings.set('ROBOTSTXT_OBEY', False) process = CrawlerProcess(settings) crawler = process.create_crawler('form') result = Result() crawler.signals.connect(result.add_item, scrapy.signals.item_scraped) process.crawl(crawler, urls=urls, crawl=crawl, auth=auth) process.start() result.print(verbosity)
def start_userscrapers(list_dicts):
    """Reset the output files, then run one UserScraper crawl per settings
    dict in *list_dicts* (blocks until all crawls finish)."""
    reset_file("user_gathering_data.txt")
    reset_file("usernames.txt")
    process = CrawlerProcess()
    for spider_args in list_dicts:
        process.crawl(process.create_crawler(UserScraper), spider_args)
    process.start(True)
def execute(self):
    """Run the configured scrape for this operator's organisation; raise if
    any scraper errors were signalled during the crawl."""
    # Initialise settings for a limited scraping
    os.environ.setdefault(
        'SCRAPY_SETTINGS_MODULE',
        'wsf_scraping.settings'
    )
    if not self.dst_s3_dir.startswith('s3://'):
        raise ValueError('Invalid S3 url: %s' % self.dst_s3_dir)
    # This monkey-patching only works because Airflow shells out to
    # a new Python interpreter for every task it runs. It thus *must*
    # remain inside execute(), so other code paths don't touch it.
    wsf_scraping.settings.MAX_ARTICLE = self.item_max
    wsf_scraping.settings.WHO_IRIS_YEARS = \
        self.item_years
    wsf_scraping.settings.FEED_URI = \
        'manifest' + self.dst_s3_dir
    settings = get_project_settings()
    # Log only the JSON-serialisable scalar settings.
    self.log.info(
        "scrapy settings: %s",
        json.dumps(
            {k: v for k, v in settings.items()
             if isinstance(v, (str, int, float, bool))}
        )
    )
    process = CrawlerProcess(settings, install_root_handler=False)
    spider = SPIDERS[self.organisation]
    crawler = process.create_crawler(spider)
    self.item_count = None
    self.scraper_errors = []
    # Record item-level and manifest-storage errors via signals so they can
    # be surfaced after the crawl.
    crawler.signals.connect(
        self.on_item_error, signal=scrapy.signals.item_error)
    crawler.signals.connect(
        self.on_manifest_storage_error,
        signal=feed_storage.manifest_storage_error)
    process.crawl(crawler)  # starts reactor
    process.start()  # waits for reactor to finish
    if self.scraper_errors:
        scraper_errors = self.scraper_errors  # put into local for sentry
        self.log.error(
            'SpiderOperator: scrapy signaled %d errors:',
            len(scraper_errors)
        )
        for tup in self.scraper_errors:
            self.log.error('DummySpiderOperator: %r', tup)
        raise Exception(
            "%d errors occurred during scrape" % len(scraper_errors)
        )
def populate(cls, season, *args, **kwargs):
    """Scrape cover data for *season* with GameSpider and report how many
    rows were saved to this model's table."""
    print(f'Scraping {season} covers')
    crawler_process = CrawlerProcess(get_project_settings())
    game_crawler = crawler_process.create_crawler(GameSpider)
    crawler_process.crawl(game_crawler, season=season, *args, **kwargs)
    crawler_process.start()
    games = game_crawler.stats.get_value('games', 0)
    print(f'Saved {games} rows to {cls.__tablename__}')
def run_spiders(name, **kwargs):
    """
    @param name: spider name
    """
    # Pre-1.0 Scrapy / Python 2: crawler.spiders is the spider manager and
    # _spiders maps name -> spider class.
    prs = CrawlerProcess( CrawlerSettings(scrapy_settings) )
    crawler = prs.create_crawler()
    for spdname, spd in crawler.spiders._spiders.iteritems():
        if name == spdname:
            spidercls = spd
    # NOTE(review): if no registered spider matches `name`, `spidercls` is
    # unbound and the next line raises NameError.
    crawler.crawl( spidercls(**kwargs) )
    prs.start()
def main():
    """Run MySpider with a fixed user agent, printing lifecycle events."""
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    crawler = process.create_crawler(MySpider)
    # Wire the diagnostic handlers to the crawler lifecycle signals.
    handlers = (
        (response_received, signals.response_received),
        (engine_started, signals.engine_started),
        (spider_opened, signals.spider_opened),
        (spider_error, signals.spider_error),
    )
    for handler, sig in handlers:
        crawler.signals.connect(handler, signal=sig)
    process.crawl(crawler)
    process.start()
def crawling_start( self, scrapy_settings: Settings, spider: object, board_code: str, return_dic: Dict) -> Dict:
    """Run *spider* synchronously, record its output under *board_code* in
    *return_dic*, and return the crawl's stats dictionary."""
    crawl_process = CrawlerProcess(scrapy_settings)
    board_crawler = crawl_process.create_crawler(spider)
    crawl_process.crawl(board_crawler, args={'callback': self._yield_output})
    crawl_process.start()  # blocks until the crawl is done
    return_dic[board_code] = self.output
    # crawler.stats is a MemoryStatsCollector; get_stats() yields a plain dict.
    return board_crawler.stats.get_stats()
class CrawlerWorker(Process):
    """Run one spider in a child process and push its items to a queue.

    Python 2 code on the pre-1.0 Scrapy API (crawlers used as a mapping,
    signals via the global dispatcher).
    """

    def __init__(self, spider, result_list, settings=None):
        Process.__init__(self)
        self.result_queue = result_list
        if settings is None:
            settings = Settings()
        self.crawler = CrawlerProcess(settings)
        self.crawler.create_crawler(spider.__class__.__name__)
        # Register the already-built spider instance under a fixed key.
        self.crawler.crawlers['spider'] = spider
        self.spider = spider
        self.items = []
        # Global dispatcher: fires for items from any crawler in process.
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)
        print "here"

    def run(self):
        # Blocks until crawling finishes, then hands items to the parent.
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
def shell(argv):
    """ Open a url in the scrapy shell """
    parser = argparse.ArgumentParser('ozzy shell', description=shell.__doc__)
    parser.add_argument('url', help="URL to open in a shell")
    args = parser.parse_args(argv)
    # Pre-1.0 Scrapy API: start crawling, then run the reactor on a daemon
    # thread so the interactive shell can own the foreground.
    crawler_process = CrawlerProcess(load_settings())
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()
    thread = Thread(target=crawler_process.start_reactor)
    thread.daemon = True
    thread.start()
    sh = Shell(crawler)
    sh.start(url=args.url)
class MainCrawler:
    """Two-stage crawler: first collect top URLs, then fetch each page,
    chaining stages through spider_closed callbacks and a state flag."""

    def __init__(self):
        self.settings = get_project_settings()
        self.session = Session()  # DB session used to read pending pages

    def run(self, url=TARGET_URL):
        """Entry point: crawl *url* for top URLs, then crawl their pages."""
        self.process = CrawlerProcess(self.settings)
        self.url = url
        self.flag = "top_urls"  # state-machine marker used by callbacks()
        configure_logging()
        self.crawl_url()
        self.process.start()  # blocks; stopped from callbacks()

    # Use repetedly
    def _crawl(self, spider, callback, urls=None):
        # Schedule one spider and fire *callback* when it closes.
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(callback, signal=signals.spider_closed)
        self.process.crawl(crawler, urls)

    def crawl_url(self, url=None):
        if not url:
            url = self.url
        spider = UrlSpider(url=url)
        self._crawl(spider, self.callbacks)

    def crawl_page(self, urls=None):
        logging.info(urls[0])
        spider = WebpageSpider(url=urls)
        self._crawl(spider, self.callbacks, urls)

    # Each functions
    def crawl_top_pages(self):
        """Queue every stored top page that has no HTML yet."""
        logging.info("START CRAWLING TOP PAGES")
        self.flag = "top_pages"
        pages_q = self.session.query(Webpage.original_url).filter(
            and_(Webpage.html == None, Webpage.path_label == self.url))
        page_list = []
        for p in pages_q:
            page_list.append(p[0])
        self.crawl_page(page_list)

    # Callbacks
    def callbacks(self, spider=None, urls=None):
        # Advance the state machine; stop the reactor after the final stage.
        logging.info("START CALLBACKS")
        if self.flag == "top_urls":
            self.crawl_top_pages()
        elif self.flag == "top_pages":
            reactor.stop()
def _runCrawler(spider, results):
    """Run *spider* in-process and put the list of scraped items on *results*.

    Pre-1.0 Scrapy API (CrawlerSettings, named crawlers, global dispatcher).
    """
    settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    crawlerProcess = CrawlerProcess(settings)
    items = []

    def _item_passed(item, response, spider):
        items.append(item)

    # Global signal: collects items from any crawler in this process.
    dispatcher.connect(_item_passed, signals.item_scraped)
    crawler = crawlerProcess.create_crawler("currentCrawler")
    crawler.crawl(spider)
    crawlerProcess.start()  # blocks until finished
    crawlerProcess.stop()
    results.put(items)
def run_spider(spider, settings, loglevel='INFO'):
    """ Run a spider with given settings """
    # Enable the scrapy-sentry error extension when a DSN is configured in
    # the environment.
    if 'SENTRY_DSN' in os.environ:
        import scrapy_sentry
        settings.setdict({
            'SENTRY_DSN': os.environ['SENTRY_DSN'],
            'EXTENSIONS': {
                "scrapy_sentry.extensions.Errors": 10,
            },
        })
    process = CrawlerProcess(settings)
    process.create_crawler().crawl(spider)
    process.start()
def _runCrawler(spider, results):
    """Run *spider* to completion and push its scraped items onto *results*.

    Uses the pre-1.0 Scrapy API: CrawlerSettings built from the project
    settings module, a named crawler, and the global signal dispatcher.
    """
    settings_module = importlib.import_module(
        'Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    crawlerProcess = CrawlerProcess(settings)
    items = []

    def _item_passed(item, response, spider):
        # Accumulate every scraped item.
        items.append(item)

    dispatcher.connect(_item_passed, signals.item_scraped)
    crawler = crawlerProcess.create_crawler("currentCrawler")
    crawler.crawl(spider)
    crawlerProcess.start()  # blocks until the crawl finishes
    crawlerProcess.stop()
    results.put(items)
def main(args):
    """Run ThedySpider once and return the scrape result wrapped as a doc."""
    crawl_settings = Settings()
    crawl_settings.setmodule(iw_settings)
    process = CrawlerProcess(crawl_settings)
    crawler = process.create_crawler(ThedySpider())
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()
    # Serialise the timestamp so the result dict is JSON-friendly.
    result["scraping_time"] = result["scraping_time"].isoformat()
    return {"doc": dict(result)}
def run(event, context):
    """Lambda-style entry point: crawl for broken links and email them."""
    collected = []

    def add_item(item):
        collected.append(item)

    # Create and run the crawler, intercepting every passed item.
    process = CrawlerProcess(get_project_settings())
    crawler = process.create_crawler('broken_link_spider')
    crawler.signals.connect(add_item, signals.item_passed)
    process.crawl(crawler)
    process.start()
    # Serialise the results and send the notification email.
    json_string = json.dumps([ob.__dict__ for ob in collected])
    print("Found broken links:", json_string)
    send_simple_message(EMAIL, json_string)
def start_malscrapers(args_list):
    """Run one Malscraper crawl per argument tuple in *args_list*.

    Raises:
        ValueError: if args_list is empty.
    """
    # Simplified from a pointless try/raise/except/re-raise construct:
    # validate and raise directly (same exception type and message).
    if len(args_list) < 1:
        raise ValueError(
            "args_list length less than 1. THIS SHOULD NEVER HAPPEN")
    # Process that holds spiders
    process = CrawlerProcess()
    for args in args_list:
        # Add spiders and arguments to each spider
        arguments = list(args)
        process.crawl(process.create_crawler(Malscraper), arguments)
    process.start(True)  # True == stop after the crawls finish
def run_spiders_concurrently(spiders: dict):
    """Schedule every spider in *spiders* on one CrawlerProcess and run them
    concurrently.

    spiders: mapping of spider name -> spider class.
    """
    default_settings = get_project_settings()
    default_settings["LOG_LEVEL"] = "ERROR"
    process = CrawlerProcess(default_settings)
    crawlers = dict()  # name -> Crawler, kept for later inspection
    for name, spider_class in spiders.items():
        logging.info(f"running {name}")
        crawler = process.create_crawler(spider_class)
        crawlers[name] = crawler
        try:
            process.crawl(crawler)
        except (AttributeError, TypeError, KeyError, ValueError, ImportError) as e:
            # A broken spider must not prevent the others from running.
            logging.error(e)
            continue
    process.start()  # blocks until all scheduled crawls finish
class GetResultCrawler(object):
    """Run named spiders in one process and collect their scraped items."""

    def __init__(self):
        self.crawled_items = []  # accumulated across all crawls
        settings = get_project_settings()
        settings["LOG_LEVEL"] = logging.WARNING
        self.process = CrawlerProcess(settings)

    def crawl(self, spider_dict):
        """Run every spider in *spider_dict* (name -> kwargs) and return the
        combined list of scraped items.

        NOTE(review): one-shot — the Twisted reactor cannot be restarted,
        so calling crawl() twice on the same instance will fail.
        """
        def _add_crawled_item(item):
            # Skip falsy items (None/empty) to keep the result list clean.
            if item:
                self.crawled_items.append(item)

        for spider_name, spider_kwargs in spider_dict.items():
            crawler = self.process.create_crawler(spider_name)
            crawler.signals.connect(_add_crawled_item, signals.item_scraped)
            self.process.crawl(crawler, **spider_kwargs)
        self.process.start(stop_after_crawl=True)
        return self.crawled_items
def main(args):
    """Crawl with ThedySpider and wrap the global `result` in a doc dict."""
    spider_settings = Settings()
    spider_settings.setmodule(iw_settings)
    process = CrawlerProcess(spider_settings)
    spider = ThedySpider()
    crawler = process.create_crawler(spider)
    crawler.signals.connect(item_scraped, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start(stop_after_crawl=True)
    process.join()
    # Make the timestamp JSON-serialisable before returning.
    result["scraping_time"] = result["scraping_time"].isoformat()
    doc = {"doc": dict(result)}
    return doc
def run_one_spider(spider_name):
    """Run one spider to completion, logging and writing its feed to temp
    files, and return its stats dict augmented with output/log paths.

    NOTE(review): on any exception this logs and implicitly returns None —
    callers must be prepared for a None result.
    """
    try:
        settings = get_project_settings()
        # Route logs and the ndgeojson feed to fresh temporary files.
        _, output_log = tempfile.mkstemp('.log')
        _, output_results = tempfile.mkstemp('.geojson')
        settings.set('LOG_FILE', output_log)
        settings.set('LOG_LEVEL', 'INFO')
        settings.set('TELNETCONSOLE_ENABLED', False)
        settings.set('FEED_URI', output_results)
        settings.set('FEED_FORMAT', 'ndgeojson')

        def spider_opened(spider):
            logger.info("Spider %s opened, saving to %s", spider.name, output_results)

        def spider_closed(spider):
            # Summarise the finished crawl from the collected stats.
            logger.info(
                "Spider %s closed (%s) after %0.1f sec, %d items",
                spider.name,
                spider.crawler.stats.get_value('finish_reason'),
                (spider.crawler.stats.get_value('finish_time') -
                 spider.crawler.stats.get_value('start_time')).total_seconds(),
                spider.crawler.stats.get_value('item_scraped_count') or 0,
            )

        process = CrawlerProcess(settings)
        crawler = process.create_crawler(spider_name)
        crawler.signals.connect(spider_closed, signals.spider_closed)
        crawler.signals.connect(spider_opened, signals.spider_opened)
        process.crawl(crawler)
        process.start()  # blocks until the crawl finishes
        results = crawler.stats.spider_stats.get(spider_name)
        results['output_filename'] = output_results
        results['log_filename'] = output_log
        results['spider'] = spider_name
        return results
    except Exception as e:
        logger.exception("Exception in scraper process")
def run_spider(spider, bail=False, debug=False, **kwargs):
    """Run *spider* and return a process exit code: 1 if any spider callback
    errored, else 0.

    bail: stop on the first error (installs HttpErrorMiddleware and sets
          CLOSESPIDER_ERRORCOUNT to 1).
    debug: verbose Scrapy logging.
    kwargs: forwarded to the spider constructor via process.crawl().
    """
    def process_spider_error(failure, response, spider):
        # Flag that at least one spider callback raised.
        nonlocal had_error
        had_error = True
    had_error = False
    spider_middlewares = {}
    if bail:
        spider_middlewares['kpopnet.spiders.HttpErrorMiddleware'] = 1
    process = CrawlerProcess({
        'LOG_LEVEL': 'DEBUG' if debug else 'WARNING',
        'USER_AGENT': USER_AGENT,
        # In bail mode a single error closes the spider (0 = unlimited).
        'CLOSESPIDER_ERRORCOUNT': 1 if bail else 0,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
        'SPIDER_MIDDLEWARES': spider_middlewares,
    })
    crawler = process.create_crawler(spider)
    crawler.signals.connect(process_spider_error, signals.spider_error)
    process.crawl(crawler, **kwargs)
    process.start()  # blocks until the crawl finishes
    return 1 if had_error else 0
class CrawlerWorker(Process):
    """Run one spider in a child process; scraped items go to a shared queue.

    Pre-1.0 Scrapy API (CrawlerSettings, named crawlers, global dispatcher).
    """

    def __init__(self, spider, results):
        Process.__init__(self)
        self.results = results  # queue the parent reads items from
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        self.crawlerProcess = CrawlerProcess(settings)
        self.items = []
        self.spider = spider
        # Global signal connection: fires for items from any crawler.
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        # Executed in the child process: crawl, then hand back the items.
        crawler = self.crawlerProcess.create_crawler("currentCrawler")
        crawler.crawl(self.spider)
        self.crawlerProcess.start()
        self.crawlerProcess.stop()
        self.results.put(self.items)
def task_spider(self, type_id, time_from, time_to):
    """Celery task: run TagSpider on a worker thread and publish progress.

    Progress (current/total video counts) is read from the crawler stats and
    pushed via update_state until the crawl completes.
    """
    settings = Settings()
    settings_module_path = os.environ.get('SCRAPY_ENV', 'BilibiliTagSpider.settings')
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings=settings)
    crawler = process.create_crawler(TagSpider)
    # Run the crawl on a background thread so this task can poll the stats.
    thread_spider = Thread(target=spider_crawl,
                           args=(process, crawler, type_id, time_from, time_to))
    thread_spider.start()
    video_cur = 0
    video_total = 0
    while True:
        # NOTE(review): busy-wait with no sleep until the stats keys appear —
        # this spins a CPU core; consider sleeping in this branch as well.
        if (crawler.stats.get_value(ScrapyField.VideoCur.value) is None or \
                crawler.stats.get_value(ScrapyField.VideoTotal.value) is None):
            continue
        video_cur = crawler.stats.get_value(ScrapyField.VideoCur.value)
        video_total = crawler.stats.get_value(ScrapyField.VideoTotal.value)
        current_task.update_state(state='PROGRESS', meta={
            ScrapyField.VideoCur.value: video_cur,
            ScrapyField.VideoTotal.value: video_total,
        })
        time.sleep(1)
        # The worker thread does not always exit cleanly, so break on the
        # business condition instead (translated comment).
        if video_cur == video_total:
            break
    task_ended(self.request.id)
    return {
        ScrapyField.VideoCur.value: crawler.stats.get_value(ScrapyField.VideoCur.value),
        ScrapyField.VideoTotal.value: crawler.stats.get_value(ScrapyField.VideoTotal.value),
    }
def start_idscrapers(start_ends):
    """Launch one Idscraper crawl per id-range pair and block until done."""
    process = CrawlerProcess()
    for pair in start_ends:
        process.crawl(process.create_crawler(Idscraper), list(pair))
    process.start(True)
import scrapy
from scrapy.crawler import CrawlerProcess
from bd.spiders.sc import SCSpider

# Run the 'sc' spider with a fixed user agent. Uses the pre-1.0 Scrapy API:
# create_crawler() with no argument plus crawler.spiders.create(name).
crawler_process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
crawler = crawler_process.create_crawler()
spider = crawler.spiders.create('sc')
crawler.crawl(spider)
crawler_process.start()
class CrawlerProcessScript(object):
    """
    Creates multiple crawlers and call them sequentially
    Crawler names should follow this naming convention:
    spider_name + _ + city + _ + category
    crawlers : keeps track of all crawlers run, so to get their stats after
    they are finished.
    """

    def __init__(self, dsite_name='', updating=False):
        self.updating = str(updating)
        self.dsite = DSite.objects.get(name=dsite_name)
        self.crawler_process = CrawlerProcess(get_project_settings())
        self.crawlers = {}  # crawler_name -> crawler, for dump_stats()

    def _add_crawler(self, crawler_name, city_mapping_pk=None, category_mapping_pk=None):
        # Create a named crawler, build its spider and schedule the crawl
        # (pre-1.0 Scrapy API: crawler.spiders.create()).
        crawler = self.crawler_process.create_crawler(crawler_name)
        spider = crawler.spiders.create(
            self.dsite.name,
            dsite_pk=self.dsite.pk,
            city_mapping_pk=city_mapping_pk,
            category_mapping_pk=category_mapping_pk,
            updating=self.updating)
        crawler.crawl(spider)
        self.crawlers[crawler_name] = crawler

    def _create_crawlers(self):
        # One crawler per (city, category) combination, depending on which
        # mapping kinds the site defines.
        if self.dsite.has_both_mappings:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=False):
                    crawler_name = self.dsite.name + '_' + city_mapping.site_city + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk,
                                      city_mapping_pk=city_mapping.pk)
            # All-cities categories get site_category-only names.
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_city_mapping:
            for city_mapping in CityMapping.objects.filter(dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + city_mapping.site_city
                self._add_crawler(crawler_name=crawler_name,
                                  city_mapping_pk=city_mapping.pk)
            if self.dsite.has_category_mapping:
                for category_mapping in CategoryMapping.objects.filter(
                        dsite=self.dsite, all_cities=True):
                    crawler_name = self.dsite.name + '_' + category_mapping.site_category
                    self._add_crawler(crawler_name=crawler_name,
                                      category_mapping_pk=category_mapping.pk)
        elif self.dsite.has_category_mapping:
            for category_mapping in CategoryMapping.objects.filter(
                    dsite=self.dsite):
                crawler_name = self.dsite.name + '_' + category_mapping.site_category
                self._add_crawler(crawler_name=crawler_name,
                                  category_mapping_pk=category_mapping.pk)

    def start(self):
        """Build all crawlers, run them, then tear the process down."""
        self._create_crawlers()
        self.crawler_process.start()
        self.crawler_process.stop()
        self.crawler_process.stop_reactor()

    def dump_stats(self):
        # Python 2: iteritems()/print statements.
        for crawler_name, crawler in self.crawlers.iteritems():
            print crawler_name
            print crawler.stats.get_stats()
def engine_started():
    """Signal handler: the Scrapy engine has started."""
    print('Engine started.')

def spider_opened(spider):
    """Signal handler: a spider was opened."""
    print('Spider opened.')

def spider_error(failure, response, spider):
    """Signal handler: a spider callback raised."""
    print('Spider error.')

# Run ScannerSpider with a fixed user agent, printing lifecycle events.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
# NOTE(review): a spider *instance* is passed where recent Scrapy expects a
# spider class; confirm against the Scrapy version in use.
crawler = process.create_crawler(ScannerSpider())
crawler.signals.connect(response_received, signal=signals.response_received)
crawler.signals.connect(engine_started, signal=signals.engine_started)
crawler.signals.connect(spider_opened, signal=signals.spider_opened)
crawler.signals.connect(spider_error, signal=signals.spider_error)
#crawler.crawl()
#import pdb; pdb.set_trace()
process.crawl(crawler)
process.start()
    # NOTE(review): these two methods belong to EthnicScraperSpider, whose
    # class header is not visible in this chunk.
    def __init__(self, artist='', *args, **kwargs):
        """Takes artist as an argument when called"""
        super(EthnicScraperSpider, self).__init__(*args, **kwargs)
        # Replaces spaces with "-" for website
        artist = artist.replace(" ", "-")
        self.start_urls = [f'https://ethnicelebs.com/{artist}']

    def parse(self, response):
        """Get artist race"""
        # NOTE(review): brittle absolute XPath into the article body — this
        # breaks whenever the page layout changes, and .get() may return
        # None, making the .find() below raise AttributeError.
        next_page = response.xpath(
            '/html/body/div/div/div/div/div/section/div[2]/article/div/div[2]/div[1]/p[4]/strong/text()'
        ).get()
        # Find the "Ethnicity: " marker and keep everything after it.
        string_start = next_page.find("Ethnicity: ") + len("Ethnicity: ")
        race = next_page[string_start:len(next_page)]
        yield {'race': race}

# Setup scraper
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'FEED_FORMAT': 'json',
    'FEED_URI': '123.json',
    'CONCURRENT_ITEMS': 1
})
crawler = process.create_crawler(EthnicScraperSpider)
process.crawl(crawler, artist='21 savage')
process.start()
import random

SEARCH_INTERVAL = 1  # days: skip sites with product info newer than this
MAX_SPIDERS = 3      # NOTE(review): the `>` check below actually allows 4

if __name__ == '__main__':
    spiders = []
    sites = list(SiteData.objects.filter(category__symbol='approved'))
    random.shuffle(sites)  # spread the load over sites between runs
    for site in sites:
        # Skip sites that already have fresh product info...
        if ProductInfo.objects.filter(product__page__site=site, date__gt = now()-datetime.timedelta(SEARCH_INTERVAL)).exists():
            continue
        # ...and sites without a scraper descriptor.
        if not ScraperDescriptor.objects.filter(site=site).exists():
            continue
        Spider = get_spiders(site.id)
        if Spider is not None:
            spiders.append(Spider())
        if len(spiders) > MAX_SPIDERS:
            break
    if spiders:
        settings = get_project_settings()
        crawler_process = CrawlerProcess(settings)
        for spider in spiders:
            # Pre-1.0 API: create a named crawler, then attach the spider.
            crawler = crawler_process.create_crawler(name=spider.name)
            crawler.crawl(spider)
        crawler_process.start()
# NOTE(review): fragment — the opening of the enclosing function (and of the
# dict literal this line begins inside) is not visible in this chunk.
        'item_scraped_count': spider.crawler.stats.get_value('item_scraped_count'),
    }
    # Summarise the finished crawl from its stats.
    print("Spider %s closed (%s) after %0.1f sec, %d items" % (
        spider.name,
        spider.crawler.stats.get_value('finish_reason'),
        (spider.crawler.stats.get_value('finish_time') -
         spider.crawler.stats.get_value('start_time')).total_seconds(),
        spider.crawler.stats.get_value('item_scraped_count') or 0,
    ))

print("Starting to crawl")
process = CrawlerProcess(settings)
# Schedule every spider the project defines, with open/close logging.
for spider_name in process.spider_loader.list():
    crawler = process.create_crawler(spider_name)
    crawler.signals.connect(spider_closed, signals.spider_closed)
    crawler.signals.connect(spider_opened, signals.spider_opened)
    process.crawl(crawler)
process.start()  # blocks until all crawls finish
print("Done crawling")

client = boto3.client('s3')
s3_key_prefix = "runs/{}".format(tstamp)
# Gzip the output geojson before upload.
with open(output_results, 'rb') as f_in:
    with gzip.open(output_results + '.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
s3_output_size = os.path.getsize(output_results + '.gz')
class StartScan(object):
    """A scanner application which can be run."""

    def __init__(self, configuration):
        """
        Initialize the scanner application.
        Takes the JSON descriptor of this scan as its argument.
        """
        # NOTE(review): several methods below use self.scanner / self.scan,
        # which are never assigned in this class — presumably set by a
        # subclass or externally; confirm.
        self.configuration = configuration
        scan_id = configuration['id']
        logfile = configuration['logfile']
        last_started = configuration['last_started']
        self.scan_id = scan_id
        self.logfile = logfile
        self.last_started = \
            parse_datetime(last_started) if last_started else None
        self.sitemap_crawler = None
        self.scanner_crawler = None
        self.settings = get_project_settings()
        self.crawler_process = None

    def run(self):
        """Updates the scan status and sets the pid.
        Run the scanner, blocking until finished."""
        # Each scanner process should set up logging separately, writing to
        # both the log file and to the scanner manager's standard error stream
        logging.basicConfig(
            level=logging.DEBUG,
            format="""\
%(levelname)s %(asctime)s %(module)s %(process)d %(thread)d %(message)s""",
            handlers=[
                logging.FileHandler(self.logfile),
                logging.StreamHandler(stderr)
            ])
        # Scrapy expects to be able to log things, so this call should always
        # happen after we've initialised the root logging handler
        self.crawler_process = \
            CrawlerProcess(self.settings, install_root_handler=False)
        # A new instance of django setup needs to be loaded for the scan
        # process, so the django db connection is not shared between
        # processors.
        from utils import run_django_setup
        run_django_setup()

    def handle_killed(self):
        """Handle being killed by updating the scan status."""
        from os2webscanner.models.scans.scan_model import Scan
        self.scanner.scan_object = Scan.objects.get(pk=self.scan_id)
        self.scanner.scan_object.set_scan_status_failed()
        self.scan.logging_occurrence("SCANNER FAILED: Killed")
        logging.error("Killed")

    def make_scanner_crawler(self, spider_type):
        """Setup the scanner spider and crawler."""
        self.scanner_crawler = \
            self.crawler_process.create_crawler(spider_type)
        csigs = self.scanner_crawler.signals
        csigs.connect(self.handle_closed, signal=signals.spider_closed)
        csigs.connect(self.handle_error, signal=signals.spider_error)
        csigs.connect(self.handle_idle, signal=signals.spider_idle)
        return self.scanner_crawler

    def handle_closed(self, spider, reason):
        """Handle the spider being finished."""
        # TODO: Check reason for if it was finished, cancelled, or shutdown
        logging.debug('Spider is closing. Reason {0}'.format(reason))
        self.store_stats()
        reactor.stop()

    def store_stats(self):
        """Stores scrapy scanning stats when scan is completed."""
        from os2webscanner.models.statistic_model import Statistic
        from django.core.exceptions import MultipleObjectsReturned
        logging.info('Stats: {0}'.format(self.scanner_crawler.stats.get_stats()))
        try:
            statistics, created = Statistic.objects.get_or_create(scan=self.scanner.scan_object)
        except MultipleObjectsReturned:
            # NOTE(review): on this path `statistics` stays unbound and the
            # attribute updates below raise NameError.
            logging.error('Multiple statistics objects found for scan job {}'.format(
                self.scan_id)
            )
        # Fold relevant crawl counters into the persistent statistics row.
        if self.scanner_crawler.stats.get_value(
                'last_modified_check/pages_skipped'):
            statistics.files_skipped_count += self.scanner_crawler.stats.get_value(
                'last_modified_check/pages_skipped'
            )
        if self.scanner_crawler.stats.get_value(
                'downloader/request_count'):
            statistics.files_scraped_count += self.scanner_crawler.stats.get_value(
                'downloader/request_count'
            )
        if self.scanner_crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'):
            statistics.files_is_dir_count += self.scanner_crawler.stats.get_value(
                'downloader/exception_type_count/builtins.IsADirectoryError'
            )
        statistics.save()
        logging.debug('Statistic saved.')

    def handle_error(self, failure, response, spider):
        """Printing spider errors.

        When an exception occurs in a spider callback we do not need to stop
        the scan. The scan is only stopped when the spider signals it has
        stopped. So we only print the error to the log."""
        logging.error("An error occured: %s" % failure.getErrorMessage())

    def handle_idle(self, spider):
        """Handle when the spider is idle.

        Keep it open if there are still queue items to be processed.
        """
        from os2webscanner.models.conversionqueueitem_model import ConversionQueueItem
        logging.debug("Spider Idle...")
        # Keep spider alive if there are still queue items to be processed
        remaining_queue_items = ConversionQueueItem.objects.filter(
            status__in=[ConversionQueueItem.NEW,
                        ConversionQueueItem.PROCESSING],
            url__scan=self.scanner.scan_object
        ).count()
        if remaining_queue_items > 0:
            logging.info(
                "Keeping spider alive: %d remaining queue items to process" %
                remaining_queue_items
            )
            raise DontCloseSpider
        else:
            logging.info("No more active processors, closing spider...")
from scrapy.utils.project import get_project_settings
from XinLang_news.spiders.fudan import FudanSpider

# gundongnews = FudanSpider()
settings = get_project_settings()
# crawlerprocess = CrawlerProcess(settings)
# crawler = crawlerprocess.create_crawler()
# crawler.crawl(gundongnews)
# crawlerprocess.start()
##############################################################################
# Run the 'jyb' and 'tju' spiders in a single process. Pre-1.0 Scrapy API:
# named crawlers plus crawler.spiders.create(name).
spname_list = ['jyb','tju']
# spname = 'jyb'
crawlerprocess = CrawlerProcess(settings)
for spname in spname_list:
    crawler = crawlerprocess.create_crawler(spname)
    spider = crawler.spiders.create(spname)
    crawler.crawl(spider)
crawlerprocess.start()
# crawlerprocess.start_reactor()
# log.start()
# reactor.run()