def crawl(settings=None, spider_name='all'):
    # Avoid a shared mutable default argument; this dict is mutated below.
    settings = settings if settings is not None else {}
    project_settings = project.get_project_settings()
    spider_loader = SpiderLoader.from_settings(project_settings)
    feed_uri = ""
    feed_format = "json"

    # Load either the requested spider or every spider in the project.
    spider_classes = []
    if spider_name.lower() != 'all':
        spider_class = spider_loader.load(spider_name)
        spider_classes.append(spider_class)
    else:
        spiders = spider_loader.list()
        for name in spiders:
            spider_class = spider_loader.load(name)
            spider_classes.append(spider_class)

    # Write the feed to S3 when running in AWS, otherwise to a local "output" directory.
    if is_in_aws():
        settings['HTTPCACHE_DIR'] = '/tmp'
        bucket = os.getenv('FEED_BUCKET_NAME')
        feed_uri = "s3://{}/%(name)s-%(time)s.{}".format(bucket, feed_format)
    else:
        output = os.path.join(os.getcwd(), "output")
        feed_uri = "file://{}/%(name)s-%(time)s.{}".format(output, feed_format)

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    configure_logging()
    for spider in spider_classes:
        run_spider({**project_settings, **settings}, spider)
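# A hypothetical sketch of the run_spider helper called above; it is not part of the
# original snippet. The assumption here is that it starts one CrawlerProcess per
# spider and runs each crawl in a child process, since Twisted's reactor cannot be
# restarted when crawl() loops over several spider classes. The real helper may work
# differently (for example by scheduling all spiders on a single CrawlerProcess).
import multiprocessing

from scrapy.crawler import CrawlerProcess


def _crawl_in_child(settings, spider_class):
    process = CrawlerProcess(settings)
    process.crawl(spider_class)
    process.start()  # blocks until this spider finishes


def run_spider(settings, spider_class):
    child = multiprocessing.Process(target=_crawl_in_child,
                                    args=(settings, spider_class))
    child.start()
    child.join()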
def test_multiple_dupename_warning(self):
    # copy 2 spider modules so as to have duplicate spider names;
    # a single warning should be issued, naming each duplicated spider once per location
    shutil.copyfile(
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider1.py'),
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider1dupe.py'))
    shutil.copyfile(
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider2.py'),
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider2dupe.py'))

    with warnings.catch_warnings(record=True) as w:
        spider_loader = SpiderLoader.from_settings(self.settings)

        self.assertEqual(len(w), 1)
        msg = str(w[0].message)
        self.assertIn("several spiders with the same name", msg)

        self.assertIn("'spider1'", msg)
        self.assertTrue(msg.count("'spider1'") == 2)

        self.assertIn("'spider2'", msg)
        self.assertTrue(msg.count("'spider2'") == 2)

        self.assertNotIn("'spider3'", msg)
        self.assertNotIn("'spider4'", msg)

        spiders = set(spider_loader.list())
        self.assertEqual(spiders, {'spider1', 'spider2', 'spider3', 'spider4'})
def setUp(self):
    orig_spiders_dir = os.path.join(module_dir, 'test_spiders')
    self.tmpdir = tempfile.mkdtemp()
    self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
    shutil.copytree(orig_spiders_dir, self.spiders_dir)
    sys.path.append(self.tmpdir)
    settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
    self.spider_loader = SpiderLoader.from_settings(settings)
def setUp(self):
    orig_spiders_dir = os.path.join(module_dir, "test_spiders")
    self.tmpdir = self.mktemp()
    os.mkdir(self.tmpdir)
    self.spiders_dir = os.path.join(self.tmpdir, "test_spiders_xxx")
    shutil.copytree(orig_spiders_dir, self.spiders_dir)
    sys.path.append(self.tmpdir)
    settings = Settings({"SPIDER_MODULES": ["test_spiders_xxx"]})
    self.spider_loader = SpiderLoader.from_settings(settings)
def test_bad_spider_modules_warning(self):
    with warnings.catch_warnings(record=True) as w:
        module = 'tests.test_spiderloader.test_spiders.doesnotexist'
        settings = Settings({'SPIDER_MODULES': [module]})
        spider_loader = SpiderLoader.from_settings(settings)
        self.assertIn("Could not load spiders from module", str(w[0].message))

    spiders = spider_loader.list()
    self.assertEqual(spiders, [])
def test_dupename_warning(self):
    # copy 1 spider module so as to have a duplicate spider name
    shutil.copyfile(
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3.py'),
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3dupe.py'))

    with warnings.catch_warnings(record=True) as w:
        spider_loader = SpiderLoader.from_settings(self.settings)

        self.assertEqual(len(w), 1)
        msg = str(w[0].message)
        self.assertIn("several spiders with the same name", msg)
        self.assertIn("'spider3'", msg)

        spiders = set(spider_loader.list())
        self.assertEqual(spiders, set(['spider1', 'spider2', 'spider3', 'spider4']))
def test_bad_spider_modules_warning(self):
    with warnings.catch_warnings(record=True) as w:
        module = 'tests.test_spiderloader.test_spiders.doesnotexist'
        settings = Settings({
            'SPIDER_MODULES': [module],
            'SPIDER_LOADER_WARN_ONLY': True
        })
        spider_loader = SpiderLoader.from_settings(settings)
        if str(w[0].message).startswith("_SixMetaPathImporter"):
            # needed on 3.10 because of https://github.com/benjaminp/six/issues/349,
            # at least until all six versions we can import (including botocore.vendored.six)
            # are updated to 1.16.0+
            w.pop(0)
        self.assertIn("Could not load spiders from module", str(w[0].message))

    spiders = spider_loader.list()
    self.assertEqual(spiders, [])
def cmdline_crawl(args):
    from scrapy.utils.project import get_project_settings
    from scrapy.spiderloader import SpiderLoader

    settings = get_project_settings()
    spiders = SpiderLoader.from_settings(settings)

    # With no arguments, print the available spiders and exit.
    if not args:
        spiderlist = spiders.list()
        if spiderlist:
            print('spiders list {}'.format(spiderlist))
        sys.exit()

    spidername = args.spider
    filepath = inspect.getabsfile(spiders.load(spidername))
    os.environ.pop('SCRAPY_SETTINGS_MODULE')

    settings, _conf = _get_settings_and_conf(args)
    server = connection.get_redis(**settings['REDIS_PARAMS'])

    with open(filepath, encoding='utf-8') as f:
        script = f.read()

    jsondata = _send_script_start_work(spidername, script, server)
    jsondata.pop('script')
    print('send task:')
    print(json.dumps(jsondata, indent=4))
def _prepare_domains_items_refs(self):
    spider_loader = SpiderLoader.from_settings(self.settings)

    if hasattr(self, 'spiders'):
        spider_names = getattr(self, 'spiders').split(',')
    else:
        spider_names = [spider_name for spider_name in spider_loader.list()
                        if spider_name not in self.spiders_ignored]

    for spider_name in spider_names:
        Spider = spider_loader.load(spider_name)
        for domain in Spider.allowed_domains:
            for i, item_ref in enumerate(Spider.items_refs):
                item_ref['spider_name'] = spider_name
                Spider.items_refs[i] = item_ref
            self.domains_items_refs[domain] = Spider.items_refs
        self.allowed_domains += Spider.allowed_domains

    self.allowed_domains.sort(key=len, reverse=True)
from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()
STORAGE = FilesystemCacheStorage(SETTINGS)
SPIDER_LOADER = SpiderLoader.from_settings(SETTINGS)
SPIDER_CLASS = SPIDER_LOADER.load('ksdata')
SPIDER = SPIDER_CLASS(file='../sample.csv')


def get_busted_caches():
    # Yield the cache directories of start requests whose cached response status
    # is one of the codes listed in HTTPCACHE_IGNORE_HTTP_CODES.
    for request in SPIDER.start_requests():
        meta = STORAGE._read_meta(SPIDER, request)
        if meta['status'] in SETTINGS['HTTPCACHE_IGNORE_HTTP_CODES']:
            yield STORAGE._get_request_path(SPIDER, request)


if __name__ == "__main__":
    import shutil

    for path in get_busted_caches():
        shutil.rmtree(path)
def test_load_base_spider(self):
    module = 'tests.test_spiderloader.test_spiders.spider0'
    settings = Settings({'SPIDER_MODULES': [module]})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 0
def test_load_spider_module(self):
    prefix = 'tests.test_spiderloader.test_spiders.'
    module = ','.join(prefix + s for s in ('spider1', 'spider2'))
    settings = Settings({'SPIDER_MODULES': module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
process = CrawlerProcess(settings)
spider_loader = SpiderLoader.from_settings(settings)
ranking_spider = spider_loader.load("d1rankings")

print("----------------Ranking spider started----------------")
process.crawl(ranking_spider)
process.start()
print("----------------Ranking spider finished----------------\n")
def test_load_spider_module(self):
    prefix = "tests.test_spiderloader.test_spiders."
    module = ",".join(prefix + s for s in ("spider1", "spider2"))
    settings = Settings({"SPIDER_MODULES": module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2
def main():
    logger.warn("Loading project")

    s3_bucket = os.environ.get('S3_BUCKET')
    assert s3_bucket, "Please specify an S3_BUCKET environment variable"

    utcnow = datetime.datetime.utcnow()
    tstamp = utcnow.strftime('%F-%H-%M-%S')
    pool_size = 12

    settings = get_project_settings()
    spider_loader = SpiderLoader.from_settings(settings)
    spider_names = spider_loader.list()

    pool = multiprocessing.Pool(pool_size, maxtasksperchild=1)
    logger.info("Starting to crawl %s spiders on %s processes",
                len(spider_names), pool_size)
    results = pool.imap_unordered(run_one_spider, spider_names)
    pool.close()
    pool.join()
    logger.info("Done crawling")

    # The imap_unordered call returns an iterator, so throw it in a list
    results = list(results)

    client = boto3.client('s3')
    s3_key_prefix = "runs/{}".format(tstamp)

    # Concatenate and gzip the output geojsons
    _, output_gz_filename = tempfile.mkstemp('.geojson.gz')
    with gzip.open(output_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('output_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    s3_output_size = os.path.getsize(output_gz_filename)
    s3_output_size_mb = s3_output_size / 1024 / 1024

    # Post it to S3
    s3_output_key = '{}/output.geojson.gz'.format(s3_key_prefix)
    client.upload_file(
        output_gz_filename,
        s3_bucket,
        s3_output_key,
        ExtraArgs={
            'ACL': 'public-read',
            'ContentType': 'application/json',
            'ContentDisposition': 'attachment; filename="output-{}.geojson.gz"'.format(tstamp),
        })
    logger.warn("Saved output to https://s3.amazonaws.com/%s/%s",
                s3_bucket, s3_output_key)

    # Concatenate and gzip the log files
    _, log_gz_filename = tempfile.mkstemp('.log.gz')
    with gzip.open(log_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('log_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    # Post it to S3
    s3_log_key = '{}/all_logs.txt.gz'.format(s3_key_prefix)
    client.upload_file(log_gz_filename, s3_bucket, s3_log_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'text/plain; charset=utf-8',
        'ContentEncoding': 'gzip',
    })
    logger.warn("Saved logfile to https://s3.amazonaws.com/%s/%s",
                s3_bucket, s3_log_key)

    metadata = {
        'spiders': results,
        'links': {
            'download_url': "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_output_key),
            'log_url': "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_log_key),
        }
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2, default=json_serial)

    s3_key = '{}/metadata.json'.format(s3_key_prefix)
    client.upload_file('metadata.json', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'application/json; charset=utf-8',
    })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s",
                s3_bucket, s3_key)

    s3_key = 'runs/latest/metadata.json'
    client.upload_file('metadata.json', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'application/json; charset=utf-8',
    })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s",
                s3_bucket, s3_key)

    total_count = sum(filter(None, (s['item_scraped_count'] for s in results)))

    template_content = {
        'download_url': 'https://s3.amazonaws.com/{}/{}'.format(s3_bucket, s3_output_key),
        'download_size': round(s3_output_size_mb, 1),
        'row_count': total_count,
        'spider_count': len(results),
        'updated_datetime': utcnow.replace(microsecond=0).isoformat(),
    }

    with open('info_embed.html', 'w') as f:
        f.write("<html><body>"
                "<a href=\"{download_url}\">Download</a> "
                "({download_size} MB)<br/><small>{row_count:,} rows from "
                "{spider_count} spiders, updated {updated_datetime}Z</small>"
                "</body></html>\n".format(**template_content))

    s3_key = 'runs/latest/info_embed.html'
    client.upload_file('info_embed.html', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'text/html; charset=utf-8',
    })
    logger.warn("Saved embed to https://s3.amazonaws.com/%s/%s",
                s3_bucket, s3_key)
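# A hypothetical sketch of the run_one_spider helper that main() maps over the worker
# pool; the helper is not included in the snippet above. Assumptions: each crawl is
# launched through the standard "scrapy crawl" CLI, the project registers a geojson
# feed exporter so "-o <file>.geojson" works, and the feed is written one feature per
# line so the row count can be read off the line count. The real helper likely
# collects richer per-spider statistics.
import subprocess
import tempfile


def run_one_spider(spider_name):
    _, output_filename = tempfile.mkstemp('.geojson')
    _, log_filename = tempfile.mkstemp('.log')

    # "-o" and "--logfile" are standard scrapy CLI options
    subprocess.run(
        ['scrapy', 'crawl', spider_name,
         '-o', output_filename,
         '--logfile', log_filename],
        check=False,
    )

    # Assumes a line-delimited feed: one line per scraped item
    with open(output_filename, 'rb') as f:
        item_scraped_count = sum(1 for _ in f)

    return {
        'name': spider_name,
        'output_filename': output_filename,
        'log_filename': log_filename,
        'item_scraped_count': item_scraped_count,
    }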
def test_load_spider_module_multiple(self):
    prefix = 'tests.test_spiderloader.test_spiders.'
    module = ','.join(prefix + s for s in ('spider1', 'spider2'))
    settings = Settings({'SPIDER_MODULES': module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2
def __init__(self):
    settings = get_scraper_settings()
    loader = SpiderLoader.from_settings(settings)
    self.spider_names = loader.list()
    self.server = redis.StrictRedis(host=settings.get('REDIS_HOST'),
                                    port=settings.get('REDIS_PORT'))
def list_spiders(self):
    loader = SpiderLoader.from_settings(self.settings)
    return loader.list()
def test_load_spider_module(self):
    module = "tests.test_spiderloader.test_spiders.spider1"
    settings = Settings({"SPIDER_MODULES": [module]})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 1