Exemplo n.º 1
0
class ScrapySpider:
    """Wrap a scrapy Crawler around an HqSpider and run the Twisted reactor
    in a background thread, so the crawl can be driven from synchronous code.
    """

    def __init__(self):
        # Wire the crawler so the reactor stops when the spider closes,
        # and keep the spider alive while it is merely idle.
        self.spider = HqSpider()
        self.crawler = crawler = Crawler(get_project_settings())
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(self.spider)
        dispatcher.connect(self._dont_close_me, signals.spider_idle)
        self.thread = None
        self._started = False
        self._stopped = False

    def start(self):
        """Start the crawl and the reactor in a background thread.

        Raises:
            Exception: if the spider has already been started once.
        """
        def run():
            try:
                logging.info('Start spider')
                # installSignalHandlers=False: the reactor runs outside the
                # main thread, where signal handlers cannot be installed.
                reactor.run(installSignalHandlers=False)
            except Exception:
                # except/print written in the form valid on both Py2.6+ and
                # Py3 (original used Py2-only syntax with an unused `e`).
                print(traceback.format_exc())

        if not self._started:
            self._started = True
            self.crawler.start()
            log.start_from_settings(get_project_settings())
            self.thread = Thread(target=run)
            log.msg('Start')
            self.thread.start()
        else:
            raise Exception('spider has already started.')
Exemplo n.º 2
0
 def __init__(self, settings):
     """Initialize the crawler process.

     Installs OS shutdown handlers and starts scrapy logging from the
     given settings.

     Args:
         settings: scrapy settings object forwarded to the base class.
     """
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     self.stopping = False
     # Keep the observer so logging can be stopped cleanly later.
     self.log_observer = log.start_from_settings(self.settings)
     log.scrapy_info(settings)
Exemplo n.º 3
0
 def __init__(self, settings):
     """Initialize the crawler process.

     Installs OS shutdown handlers and starts scrapy logging from the
     given settings.

     Args:
         settings: scrapy settings object forwarded to the base class.
     """
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     self.stopping = False
     # Keep the observer so logging can be stopped cleanly later.
     self.log_observer = log.start_from_settings(self.settings)
     log.scrapy_info(settings)
Exemplo n.º 4
0
__author__ = 'LeoDong'
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from SAECrawlers.spiders.Updater import Updater

# Script entry: run the Updater spider once under the project settings.
spider = Updater()
settings = get_project_settings()
crawler = Crawler(settings)
# Stop the reactor as soon as the spider finishes.
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start_from_settings(settings, crawler)
# Blocks until reactor.stop() fires via the spider_closed signal above.
reactor.run()
Exemplo n.º 5
0
 def start(self, stop_after_crawl=True, start_reactor=True):
     """Start scrapy logging and, unless suppressed, the reactor.

     Args:
         stop_after_crawl: forwarded to the reactor starter.
         start_reactor: when False, only logging is set up.
     """
     self.log_observer = log.start_from_settings(self.settings)
     log.scrapy_info(self.settings)
     if not start_reactor:
         return
     self._start_reactor(stop_after_crawl)
Exemplo n.º 6
0
from __future__ import division, absolute_import, print_function, unicode_literals

from inspect import isclass

from twisted.internet import reactor, defer
from scrapy import log
from scrapy.settings import CrawlerSettings
from scrapy.crawler import Crawler
from scrapy.spidermanager import SpiderManager

from oucfeed.crawler import settings, datastore, history
from oucfeed.crawler.uploader import upload


crawler_settings = CrawlerSettings(settings)
log.start_from_settings(crawler_settings)
spidermanager = SpiderManager.from_settings(crawler_settings)


def setup_output():
    crawler_settings.overrides['FEED_URI'] = 'test.js'
    crawler_settings.overrides['FEED_FORMAT'] = 'js'


def init_spider(spider):
    if isinstance(spider, basestring):
        spider = spidermanager.create(spider)
    elif isclass(spider):
        spider = spider()
    return spider
Exemplo n.º 7
0
 def crawler(self):
     """Return the configured crawler, starting scrapy logging on first use."""
     if not log.started:
         log.start_from_settings(self.settings)
     configured = self._crawler
     configured.configure()
     return configured
Exemplo n.º 8
0
 def start(self, stop_after_crawl=True, start_reactor=True):
     """Begin logging from settings; optionally run the reactor.

     Args:
         stop_after_crawl: forwarded to the reactor starter.
         start_reactor: when False, only logging is set up.
     """
     settings = self.settings
     self.log_observer = log.start_from_settings(settings)
     log.scrapy_info(settings)
     if start_reactor:
         self._start_reactor(stop_after_crawl)
Exemplo n.º 9
0
 def crawler(self):
     """Ensure logging is running, then configure and return the crawler."""
     logging_up = log.started
     if not logging_up:
         log.start_from_settings(self.settings)
     self._crawler.configure()
     return self._crawler