import logging
import traceback
from threading import Thread

from twisted.internet import reactor
from scrapy import log, signals
from scrapy.crawler import Crawler
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher  # pydispatch bundled with older Scrapy releases

# HqSpider is the project's spider class; its import is not shown in this snippet.


class ScrapySpider:

    def __init__(self):
        self.spider = HqSpider()
        self.crawler = crawler = Crawler(get_project_settings())
        # Stop the reactor once the spider has closed.
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(self.spider)
        # Keep the spider from being closed when it goes idle.
        dispatcher.connect(self._dont_close_me, signals.spider_idle)
        self.thread = None
        self._started = False
        self._stopped = False

    def start(self):
        def run():
            try:
                logging.info('Start spider')
                reactor.run(installSignalHandlers=False)
            except Exception:
                print traceback.format_exc()

        if not self._started:
            self._started = True
            self.crawler.start()
            log.start_from_settings(get_project_settings())
            self.thread = Thread(target=run)
            log.msg('Start')
            self.thread.start()
        else:
            raise Exception('spider has already started.')
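# Usage sketch (not part of the original snippet): assuming HqSpider and the
# project settings resolve correctly, the wrapper above could be driven like this.
if __name__ == '__main__':
    runner = ScrapySpider()
    runner.start()          # starts the crawler and runs the reactor in a background thread
    runner.thread.join()    # block until the reactor thread exits (spider_closed stops it)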
def __init__(self, settings):
    super(CrawlerProcess, self).__init__(settings)
    install_shutdown_handlers(self._signal_shutdown)
    self.stopping = False
    self.log_observer = log.start_from_settings(self.settings)
    log.scrapy_info(settings)
__author__ = 'LeoDong'

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from SAECrawlers.spiders.Updater import Updater

spider = Updater()
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start_from_settings(settings, crawler)
reactor.run()
def start(self, stop_after_crawl=True, start_reactor=True):
    self.log_observer = log.start_from_settings(self.settings)
    log.scrapy_info(self.settings)
    if start_reactor:
        self._start_reactor(stop_after_crawl)
from __future__ import division, absolute_import, print_function, unicode_literals

from inspect import isclass

from twisted.internet import reactor, defer
from scrapy import log
from scrapy.settings import CrawlerSettings
from scrapy.crawler import Crawler
from scrapy.spidermanager import SpiderManager

from oucfeed.crawler import settings, datastore, history
from oucfeed.crawler.uploader import upload

crawler_settings = CrawlerSettings(settings)
log.start_from_settings(crawler_settings)
spidermanager = SpiderManager.from_settings(crawler_settings)


def setup_output():
    crawler_settings.overrides['FEED_URI'] = 'test.js'
    crawler_settings.overrides['FEED_FORMAT'] = 'js'


def init_spider(spider):
    if isinstance(spider, basestring):
        spider = spidermanager.create(spider)
    elif isclass(spider):
        spider = spider()
    return spider
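# Usage sketch (not part of the original snippet): 'news' is a hypothetical
# spider name registered with the project's SpiderManager.
setup_output()                   # redirect feed export to test.js
spider = init_spider('news')     # also accepts a spider class or an instance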
def crawler(self):
    if not log.started:
        log.start_from_settings(self.settings)
    self._crawler.configure()
    return self._crawler