def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado import manhole

    settings = {'LOG_LEVEL': loglevel}
    crawler_process = ArachnadoCrawlerProcess(settings)
    app = get_application(crawler_process, opts)
    app.listen(int(port), host)
    if start_manhole:
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
    crawler_process.start(stop_after_crawl=False)
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado import manhole

    settings = {'LOG_LEVEL': loglevel}
    crawler_process = ArachnadoCrawlerProcess(settings)
    app = get_application(crawler_process, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" % (__version__, host, port))
    if start_manhole:
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" % (
            manhole_host, manhole_port))
    crawler_process.start(stop_after_crawl=False)
def main(port, host, start_manhole, manhole_port, manhole_host, loglevel, opts):
    from arachnado.handlers import get_application
    from arachnado.crawler_process import ArachnadoCrawlerProcess
    from arachnado.site_checker import get_site_checker_crawler
    from arachnado.storages.mongo import MongoStorage
    from arachnado.storages.mongotail import MongoTailStorage
    from arachnado.domain_crawlers import DomainCrawlers
    from arachnado.cron import Cron

    # logger, __version__, _getval and _parse_spider_packages are module-level
    # names defined elsewhere in this module.
    settings = {
        'LOG_LEVEL': loglevel,
    }

    # mongo export options
    storage_opts = opts['arachnado.storage']
    assert storage_opts['enabled'], "Storage can't be turned off"
    items_uri = _getval(storage_opts, 'items_uri_env', 'items_uri')
    jobs_uri = _getval(storage_opts, 'jobs_uri_env', 'jobs_uri')
    sites_uri = _getval(storage_opts, 'sites_uri_env', 'sites_uri')

    # pass UPPERCASE keys through as Scrapy settings
    scrapy_opts = opts['arachnado.scrapy']
    settings.update({k: v for k, v in scrapy_opts.items() if k.isupper()})
    settings.update({
        'MONGO_EXPORT_ENABLED': storage_opts['enabled'],
        'MONGO_EXPORT_JOBS_URI': jobs_uri,
        'MONGO_EXPORT_ITEMS_URI': items_uri,
    })

    # MongoDB-backed storages for jobs, sites and scraped items
    job_storage = MongoTailStorage(jobs_uri, cache=True)
    job_storage.ensure_index("urls")
    site_storage = MongoStorage(sites_uri, cache=True)
    item_storage = MongoTailStorage(items_uri)
    item_storage.ensure_index("url")
    item_storage.ensure_index("_job_id")

    crawler_process = ArachnadoCrawlerProcess(settings)

    site_checker_crawler = get_site_checker_crawler(site_storage)
    crawler_process.crawl(site_checker_crawler)

    spider_packages = scrapy_opts['spider_packages']
    default_spider_name = scrapy_opts['default_spider_name']
    domain_crawlers = DomainCrawlers(
        crawler_process=crawler_process,
        spider_packages=_parse_spider_packages(spider_packages),
        default_spider_name=default_spider_name,
        settings=settings,
    )
    domain_crawlers.resume(job_storage)

    cron = Cron(domain_crawlers, site_storage)
    cron.start()

    app = get_application(crawler_process, domain_crawlers, site_storage,
                          item_storage, job_storage, opts)
    app.listen(int(port), host)
    logger.info("Arachnado v%s is started on %s:%s" % (__version__, host, port))

    if start_manhole:
        from arachnado import manhole
        manhole.start(manhole_port, manhole_host, {'cp': crawler_process})
        logger.info("Manhole server is started on %s:%s" % (
            manhole_host, manhole_port))

    crawler_process.start(stop_after_crawl=False)
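# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal example of how main() above might be invoked directly, assuming
# the module-level helpers it relies on (_getval, _parse_spider_packages,
# logger, __version__) are available as in arachnado/__main__.py. The opts
# dict mirrors the keys main() reads ('arachnado.storage', 'arachnado.scrapy');
# all concrete values (URIs, ports, spider name) are made-up placeholders,
# and real runs build opts from the Arachnado config / command-line options.
if __name__ == '__main__':
    example_opts = {
        'arachnado.storage': {
            'enabled': True,
            # the *_uri_env keys are assumed to name environment variables
            # that can override the plain *_uri values via _getval
            'items_uri_env': '', 'items_uri': 'mongodb://localhost/arachnado.items',
            'jobs_uri_env': '', 'jobs_uri': 'mongodb://localhost/arachnado.jobs',
            'sites_uri_env': '', 'sites_uri': 'mongodb://localhost/arachnado.sites',
        },
        'arachnado.scrapy': {
            'spider_packages': '',             # extra packages with custom spiders
            'default_spider_name': 'generic',  # placeholder spider name
        },
    }
    main(port=8888, host='0.0.0.0',
         start_manhole=False, manhole_port=6023, manhole_host='127.0.0.1',
         loglevel='DEBUG', opts=example_opts)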