def crawl(settings={}, spider_name="adac", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "json" try: spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception: logging.exception("Spider or kwargs need start_urls.") if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( os.path.join(os.getcwd(), "feed"), spider_key, ) settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start()
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "json" try: spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception: logging.exception("Spider or kwargs need start_urls.") if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"): settings["HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage" settings["S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache" settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start()
def crawl(settings={}, spider_name="spider", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "json" #spider output output in json format try: spider_key = urlparse( spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception: logging.exception( "Spider or kwargs need start_urls." ) #try except statement to detect if no start_urls were initialised if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( #output json data to feed folder in working directory os.path.join(os.getcwd(), "feed"), spider_key, ) settings[ 'FEED_URI'] = feed_uri #settings.py contains project settings of spider settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start() #initiate spider web crawler
def get_crawler_class(self, crawler):
    """
    Searches through the modules in self.__crawer_module for a crawler
    with the name passed along.

    :param str crawler: Name of the crawler to load
    :rtype: crawler-class
    """
    settings = Settings()
    settings.set('SPIDER_MODULES', [self.__crawer_module])
    spider_loader = SpiderLoader(settings)
    return spider_loader.load(crawler)
def main(argv):
    module_name = argv[1]
    spider_name = argv[2]
    env = ''
    if len(sys.argv) > 3:
        env = argv[3]

    settings = load_conf(module_name, env)
    spider_loader = SpiderLoader(settings)
    crawler = CrawlerProcess(settings)
    spider = spider_loader.load(spider_name)
    crawler.crawl(spider)
    crawler.start()
def run_spider_from_queue(self):
    if self.q.empty():
        self.remaining_spiders -= 1
        if self.remaining_spiders == 0:
            logger.debug("Stop reactor")
            reactor.stop()
        return

    blog = self.q.get()
    loader = SpiderLoader(get_project_settings())
    spidercls = loader.load(self.blogs[blog]['spider'])
    crawler = Crawler(spidercls, get_project_settings())
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    self.runner.crawl(crawler, **self.blogs[blog], blog_name=blog)
def handle(self, *args, **options):
    settings = Settings({
        "SPIDER_MODULES": ["scraping.spiders", "scraping.spiders.ics"],
    })
    spider_loader = SpiderLoader(settings)

    # Run all spiders if none specified
    spiders = options["spider"] or spider_loader.list()

    configure_logging()
    runner = CrawlerRunner(settings=settings)
    for spider_name in spiders:
        runner.crawl(spider_loader.load(spider_name))
    deferred = runner.join()
    deferred.addBoth(lambda _: reactor.stop())
    reactor.run()
def run_spider(settings, itemcount, keyheader='', conid='', spider_id=0):
    s = Settings()
    s.setmodule(settings)
    sl = SpiderLoader(settings=s)
    print('spider list=', sl.list())
    spider = sl.load(sl.list()[spider_id])
    spider.itemcount = itemcount

    configure_logging({'LOG_LEVEL': 'DEBUG'})  # set the Scrapy log level
    runner = CrawlerRunner(settings=s)
    crawler = runner.create_crawler(spider)
    # if sighandler != None:
    #     sighandler.connect(crawler)
    d = runner.crawl(crawler, keyheader=keyheader, conid=conid)
    # d = runner.crawl(spider, keyheader=keyheader, itemcount=itemcount)
    return d
class Updater:

    REQUIRED_PARAMETERS = ['MONGO_HOST', 'MONGO_PORT', 'MONGO_DB', 'SPIDERS']

    def __init__(self, settings):
        self.__validate_settings(settings)
        self.settings = settings
        self.spiders = settings.get('SPIDERS')
        self.register = MongoUpdatesRegister(settings)
        self.register.open_db()
        self.spider_loader = SpiderLoader(settings)

    def __validate_settings(self, settings):
        for parameter in Updater.REQUIRED_PARAMETERS:
            if parameter not in settings:
                raise MissingSetting(parameter)

    def run(self):
        process = CrawlerProcess(self.settings)
        for spider in self.spiders:
            kwargs = self._spider_args(spider)
            process.crawl(spider, **kwargs)
        update_id = self.register.start(self.spiders)
        process.start()
        if self._failed(process):
            self.register.fail(update_id)
        else:
            self.register.succeed(update_id)

    def _spider_args(self, spider):
        spider_cls = self.spider_loader.load(spider)
        kwargs = {}
        if self._accepts_last(spider_cls):
            last = self.register.last(spider)
            if last is not None:
                kwargs['last'] = last.start
        return kwargs

    def _accepts_last(self, cls):
        spider_parameters = signature(cls.__init__).parameters
        return 'last' in spider_parameters

    def _failed(self, process):
        finish_reasons = [crawler.stats.get_value('finish_reason') for crawler in process.crawlers]
        return any(reason != 'finished' for reason in finish_reasons)
def crawl(settings={}, spider_name="linksExtract", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "csv" if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s.csv" settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start() time.sleep(0.5)
def crawl(settings={}, spider_name="", key="", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "csv" spider_key = "" try: spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception as e: logging.exception("Spider or kwargs need start_urls.") logging.exception(e) if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/{spider_name}_{key}.csv" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( os.path.join(os.getcwd(), "feed"), spider_key, ) settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start() if is_in_aws() and has_task_token(): import boto3 import json client = boto3.client('stepfunctions') client.send_task_success( taskToken=os.getenv('TASK_TOKEN_ENV_VARIABLE'), output=json.dumps({"feed_uri": feed_uri}) )
def run_spider():
    s = Settings()
    s.setmodule(ulsan_settings)
    # process = CrawlerProcess(get_project_settings())

    sl = SpiderLoader(settings=s)
    print('#### spider list=', sl.list())
    spider = sl.load(sl.list()[0])

    # process = CrawlerProcess(settings=s)
    # d = process.crawl(spider)
    # process.crawl(UillOrKr)
    # process.start(stop_after_crawl=False)
    # process.start()

    # configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
    # configure_logging({'LOG_LEVEL': 'DEBUG'})
    runner = CrawlerRunner(settings=s)
    print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
    d = runner.crawl(spider)
    # d.addBoth(lambda _: reactor.stop())
    # reactor.run()
    # return d
    return d
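# A minimal usage sketch for the CrawlerRunner-based run_spider() above
# (assumption: the caller owns the Twisted reactor, as hinted by the
# commented-out lines in the original): chain the returned Deferred so the
# reactor stops once the crawl finishes.
from twisted.internet import reactor

d = run_spider()
d.addBoth(lambda _: reactor.stop())  # stop on success or failure
reactor.run()                        # blocks until reactor.stop() is called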
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "json" try: spider_key = urlparse( spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception: logging.exception("Spider or kwargs need start_urls.") if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( os.path.join(os.getcwd(), "feed"), spider_key, ) if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"): settings[ "HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage" settings[ "S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache" settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start()
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


if __name__ == '__main__':
    spider_loader = SpiderLoader(get_project_settings())
    spiders = spider_loader.list()

    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider_loader.load(spider))
    process.start()