def handle(self, *args, **options):
    if len(args) != 1 or args[0] == u"help":
        self.stdout.write(u"Usage: {0}\n".format(self.args))
        self.stdout.write(self.help)
    else:
        settings = get_project_settings()
        settings.overrides["URLS"] = args[0]
        crawler = Crawler(settings)
        spider = GeneralSpider()
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start_from_crawler(crawler)
        # stop the reactor once the spider has finished
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        try:
            log.msg("Running reactor...")
            reactor.run()
        except KeyboardInterrupt:
            stop_reactor()
        finally:
            log.msg("Reactor stopped")
            log.msg("#" * 40)
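# Note: stop_reactor() is called in the KeyboardInterrupt branch above but is
# not defined in this snippet. A minimal sketch, assuming it only stops the
# Twisted reactor if it is still running (the name and behaviour are
# assumptions, not the original helper):
def stop_reactor():
    from twisted.internet import reactor
    if reactor.running:
        reactor.stop()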
def run_spider(spider):
    """Set up signal handlers and run the spider."""
    # set up signals to catch spider errors and downloaded responses
    import sys
    import warnings
    from scrapy import log
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_exception(sender, failure, response, spider):
        print "Response: %s [%s]" % (response.body, response.meta)
        sys.stdout.flush()
    dispatcher.connect(catch_exception, signal=signals.spider_error)

    def catch_resp_dld(sender, response, request, spider):
        print "Downloaded (%s) Response %s" % (response.status, response.url)
        sys.stdout.flush()
    dispatcher.connect(catch_resp_dld, signal=signals.response_downloaded)

    # settings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        from scrapy.conf import settings as default_settings
    default_settings.overrides.update({
        'LOG_ENABLED': False,
        'LOG_LEVEL': 'CRITICAL',
        'BOT_NAME': 'project',
    })
    # Update general settings with spider-specific ones
    for k, v in spider.settings.iteritems():
        if isinstance(v, dict) and k in default_settings.overrides:
            default_settings.overrides[k].update(v)
        else:
            default_settings.overrides[k] = v

    # set up crawler
    from twisted.internet import reactor
    from scrapy.crawler import Crawler
    crawler = Crawler(default_settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    log.start_from_crawler(crawler)

    # start scrapy engine / twisted reactor
    crawler.start()
    if not reactor.running:
        reactor.run()
    crawler.uninstall()
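# Usage sketch for run_spider above. The loop over spider.settings implies the
# spider instance carries a plain ``settings`` dict of per-spider overrides;
# the spider class and values below are hypothetical examples, not part of the
# original project:
from scrapy.spider import BaseSpider

class ExampleSpider(BaseSpider):
    name = "example"
    start_urls = ["http://example.com/"]
    settings = {"DOWNLOAD_DELAY": 1}

    def parse(self, response):
        pass

run_spider(ExampleSpider())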
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
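# Note: _build_settings() is referenced above but not shown. A minimal sketch,
# assuming it merges an optional dict of overrides into an old-style Scrapy
# settings object (the exact behaviour of the original helper is an
# assumption):
def _build_settings(settings=None):
    from scrapy.settings import CrawlerSettings
    built = CrawlerSettings()
    if settings:
        built.overrides.update(settings)
    return built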
def run_retry_spider():
    spider = retrySpider.RetrySpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
def start():
    with open('E:/PyCharm/CatPackages/resources/doc/user_500.txt') as f:
        uid_list = [line.strip() for line in f.readlines()]
    spider = userSpider.UserSpider(uid_list=uid_list)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
def run_weibo_spider():
    uid_list = read_uid_list('E:/PyCharm/CatPackages/resources/doc/user_500.txt')
    print(uid_list)
    spider = weiboSpider.WeiboSpider(uid_list, start='2015-03-15', end='2015-04-15')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start_from_crawler(crawler)
    reactor.run()
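# Note: read_uid_list() is called above but not defined in this snippet. A
# minimal sketch, assuming it returns the stripped, non-empty lines of the
# given file (an assumption, mirroring the inline version in start() above):
def read_uid_list(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]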
def _start_crawler(self):
    if not self.crawlers or self.stopping:
        return
    name, crawler = self.crawlers.popitem()
    self._active_crawler = crawler
    sflo = log.start_from_crawler(crawler)
    crawler.configure()
    crawler.install()
    crawler.signals.connect(crawler.uninstall, signals.engine_stopped)
    if sflo:
        crawler.signals.connect(sflo.stop, signals.engine_stopped)
    crawler.signals.connect(self._check_done, signals.engine_stopped)
    crawler.start()
    return name, crawler
def _setup_crawler_logging(self, crawler):
    log_observer = log.start_from_crawler(crawler)
    if log_observer:
        crawler.signals.connect(log_observer.stop, signals.engine_stopped)
def crawler(self):
    if not self.configured:
        log.start_from_crawler(self._crawler)
        self._crawler.configure()
        self.configured = True
    return self._crawler
def _setup_crawler_logging(self, crawler):
    log_observer = scrapy_log.start_from_crawler(crawler)
    if log_observer:
        monkey_patch_and_connect_log_observer(crawler, log_observer)
    if self.log_observer:
        monkey_patch_and_connect_log_observer(crawler, self.log_observer)
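# Note: monkey_patch_and_connect_log_observer() is not shown here. A minimal
# sketch, assuming it only connects the observer's stop method to the
# crawler's engine_stopped signal, as the other snippets in this collection
# do; the real helper may also patch the observer, so treat this as an
# assumption:
def monkey_patch_and_connect_log_observer(crawler, log_observer):
    crawler.signals.connect(log_observer.stop, signal=signals.engine_stopped)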
def _create_logged_crawler(self, spidercls):
    crawler = self._create_crawler(spidercls)
    log_observer = log.start_from_crawler(crawler)
    if log_observer:
        crawler.signals.connect(log_observer.stop, signals.engine_stopped)
    return crawler
#!/usr/bin/python2
from angellist import settings
from scrapy import log
from scrapy.crawler import CrawlerProcess
from scrapy.settings import CrawlerSettings

MySettings = CrawlerSettings(settings_module=settings)
MyCrawler = CrawlerProcess(MySettings)
log.start_from_crawler(MyCrawler)
MyCrawler.configure()
for spider_object in MyCrawler.spiders._spiders.itervalues():
    MyCrawler.crawl(spider_object())
MyCrawler.start()