class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)
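# A minimal driver sketch for the CrawlerScript above, assuming it lives in a
# module where settings, project, dispatcher, signals, CrawlerProcess, Process
# and Queue are already imported, and that a spider named 'example' is
# registered in the project; the spider name and __main__ guard are illustrative.
if __name__ == '__main__':
    script = CrawlerScript()
    # crawl() runs the spider in a child process and returns the collected items
    for scraped_item in script.crawl('example'):
        print scraped_item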
class CrawlerWorker(multiprocessing.Process):

    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
    #__init__

    def _item_passed(self, item):
        self.items.append(item)
    # _item_passed

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.result_queue.put(self.items)
    #run
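# A hedged usage sketch for CrawlerWorker: it is a multiprocessing.Process, so a
# caller starts it and reads the item list back from the queue. MySpider and the
# variable names are illustrative, not part of the original snippet.
result_queue = multiprocessing.Queue()
worker = CrawlerWorker(MySpider(), result_queue)
worker.start()
worker.join()
items = result_queue.get()   # the worker puts the full item list here in run()
print "%d items scraped" % len(items)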
def create_crawler(spider):
    '''Set up the item signal and schedule the spider.'''
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(spider)
    return crawler
def scrapeando():
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        """Fill the database."""
        for i in enumerate(item.items()):
            x = i[0]
            query = ("INSERT INTO book (Nombre, Autor, Editorial, Fecha, Precio, Link) VALUES ("
                     + decodifica(item['Nombre'][x]) + ","
                     + decodifica(item['Autor'][x]) + ","
                     + decodifica(item['Editorial'][x]) + ","
                     + decodifica(item['Fecha'][x]) + ","
                     + decodifica(item['Precio'][x]) + ","
                     + decodifica("http://www.casadellibro.com" + item['Link'][x]) + ");")
            db.micursor.execute(query)
            db.conexion.commit()
        print item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    book = BookSpider()
    book.busqueda = unicode(search.getbusqueda())
    crawler.crawl(book)

    print "Start scraping la Casa del Libro"
    crawler.start()
    print "End scraping la Casa del Libro"
    crawler.stop()
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    _check_deprecated_scrapy_ctl(argv, inproject)  # TODO: remove for Scrapy 0.11
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def main():
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(MySpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def test_cp():
    crawlerProcess = CrawlerProcess(scrapy_conf)
    crawlerProcess.install()
    crawlerProcess.configure()
    crawlerProcess.queue.append_spider(myspider)
    crawlerProcess.start()
def run_spider(spider, settings):
    """Run a spider with given settings."""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # clean storage
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess
    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    #log.start(loglevel='DEBUG')
    crawler.start()
def handle(self, *args, **options):
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True

    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    from alescspider.spiders import *
    spiders = [deputado_spider.DeputadoSpider()]
    #spiders = [presenca_spider.PresencaSpider(), votos_spider.VotosSpider(), deputado_spider.DeputadoSpider()]
    for spider in spiders:
        crawler.queue.append_spider(spider)

    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def main():
    """Main routine for running the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Extracted item:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule the spider on the crawler
    crawler.crawl(BloggerSpider())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
class Worker(multiprocessing.Process):

    def __init__(self, spider, deckbox_user, deckbox_pass):
        multiprocessing.Process.__init__(self)
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        self.username = deckbox_user
        self.password = deckbox_pass
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self):
        if self.spider:
            self.crawler.crawl(self.spider(self.username, self.password))
            self.crawler.start()
            self.crawler.stop()

    def crawl(self, spider):
        p = Process(target=self._crawl)
        p.start()
        p.join()
def main():
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    options = parse_args()
    dispatcher.connect(catch_item, signal=signals.item_passed)

    # logging and crawl depth settings
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = True
    settings.overrides['DEPTH_LIMIT'] = 2

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    spider = MySpider(input=options.input, output=options.output)
    crawler.queue.append_spider(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def run_spider(spider, settings):
    """Run a spider with given settings."""
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start()
    crawler.start()
def run_tide_scrapy(stationID, startDate, endDate, **kwargs):
    settings.overrides.update({})  # your settings
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = TideSpider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
    except:
        print "error"
def run_spider(spider, settings=None):
    """Run a spider instance through the scrapy crawler.

    This function is suitable for standalone scripts.
    """
    crawler = CrawlerProcess(_build_settings(settings))
    crawler.install()
    crawler.configure()
    log.start_from_crawler(crawler)
    crawler.crawl(spider)
    crawler.start()
def run_spider(spider, settings):
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    # schedule spider
    crawler.crawl(spider)
    log.start()
    # start engine scrapy/twisted
    crawler.start()
def run_wind_spider(startDate, endDate, **kwargs):
    wind_crawlerProcess = CrawlerProcess(settings)
    wind_crawlerProcess.install()
    wind_crawlerProcess.configure()
    spider2 = WindSpider("dpia1", sys.argv[1], sys.argv[2])
    wind_crawlerProcess.crawl(spider2)
    try:
        wind_crawlerProcess.start()
        wind_crawlerProcess.stop()
        wind_crawlerProcess.uninstall()
    except Exception as e:
        print e
def run_river_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    spider = RiverSpider(sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
def runscrapy(stationID, startDate, endDate, **kwargs):
    crawlerProcess = CrawlerProcess(settings)
    crawlerProcess.install()
    crawlerProcess.configure()
    spider = Spider(stationID, startDate, endDate)
    crawlerProcess.crawl(spider)
    try:
        crawlerProcess.start()
        crawlerProcess.stop()
        crawlerProcess.uninstall()
    except Exception as e:
        print e
def setUp(self):
    crawler = CrawlerProcess(settings)
    crawler.install()
    # what does this do?
    inside_project()
    self.items = []
    self.crawl_cmd = scrapy.commands.crawl.Command()
    self.crawl_cmd.set_crawler(crawler)
    self.parser = optparse.OptionParser()
    self.crawl_cmd.add_options(self.parser)
    dispatcher.connect(self._item_passed, signals.item_passed)
def run_water_spider(startDate, endDate, **kwargs):
    water_crawlerProcess = CrawlerProcess(settings)
    water_crawlerProcess.install()
    water_crawlerProcess.configure()
    spider = WaterSpider("8735180", sys.argv[1], sys.argv[2])
    water_crawlerProcess.crawl(spider)
    try:
        water_crawlerProcess.start()
        water_crawlerProcess.stop()
        water_crawlerProcess.uninstall()
    except Exception as e:
        print e
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf
        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def run_crawler(argv=None, settings=None):
    """Run the scrapy crawler bound to the registered spiders.

    This function is suitable for standalone scripts.

    Usage::

        # mimic 'scrapy crawl' command having these two spiders available
        SpiderManager.register(FooSpider)
        SpiderManager.register(BarSpider)

        run_crawler()
    """
    argv = argv or sys.argv
    settings = _build_settings(settings)

    # load spider manager from this module
    settings.overrides.update({
        'SPIDER_MANAGER_CLASS': '%s.%s' % (__name__, SpiderManager.__name__),
    })

    crawler = CrawlerProcess(settings)
    crawler.install()

    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter())
    parser.add_option('-l', '--list', action='store_true',
                      help="List available spiders")

    cmd = CrawlCommand()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    parser.usage = "%s %s" % (argv[0], cmd.syntax())
    opts, args = parser.parse_args()

    if opts.list:
        settings.defaults.update(ListCommand.default_settings)
        listcmd = ListCommand()
        listcmd.set_crawler(crawler)
        listcmd.run(args, opts)
        sys.exit(listcmd.exitcode)
    else:
        cmdline._run_print_help(parser, cmd.process_options, args, opts)
        cmd.set_crawler(crawler)
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
        sys.exit(cmd.exitcode)
def main_spider():
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # setting
    setting()

    # set log
    start(logfile='log/spider/spider.log', loglevel='INFO', logstdout=False)

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    # schedule spider
    crawler.crawl(coreSpider())

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print '********************'

    c = coreSpider()
    getinfo = c.get_getinfo()
    print getinfo
    urls = c.get_urls()
    forms = c.get_forms(urls)
    print forms
    print len(c.get_urls())
    print "ENGINE STOPPED"

    # scanner
    h = HTTP()
    a = Attack_XSS(h)
    tmp = a.attack(getinfo, forms)
    print '%%%%%%%%%%%%%%%'
    print 'per XSS start'
    p_xss = Attack_permanentXSS(h)
    p_xss.attack_p(getinfo, forms, tmp)
def run_spider(spider, settings, loglevel='INFO'):
    """Run a spider with given settings."""
    if 'SENTRY_DSN' in os.environ:
        import scrapy_sentry
        scrapy_sentry.init(os.environ['SENTRY_DSN'])
        settings.overrides.update({
            'SENTRY_SIGNALS': ['spider_error'],
        })
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    log.start(loglevel=loglevel)
    crawler.start()
def run_tests(spider, output_file, settings):
    """
    Helper for running the contract tests of a spider and writing an XUnit
    file (for CI).

    The HTTP cache is enabled so the tests can run against offline input.
    """
    settings.overrides.update({
        "HTTPCACHE_ENABLED": True,
        "HTTPCACHE_EXPIRATION_SECS": 0,
    })

    crawler = CrawlerProcess(settings)

    contracts = build_component_list(
        crawler.settings['SPIDER_CONTRACTS_BASE'],
        crawler.settings['SPIDER_CONTRACTS'],
    )

    xunit = Xunit()
    xunit.enabled = True
    xunit.configure(AttributeDict(xunit_file=output_file), Config())
    xunit.stopTest = lambda *x: None

    check = CheckCommand()
    check.set_crawler(crawler)
    check.settings = settings
    check.conman = ContractsManager([load_object(c) for c in contracts])
    check.results = xunit

    # these are specially crafted requests that run the tests as callbacks
    requests = check.get_requests(spider)

    crawler.install()
    crawler.configure()
    crawler.crawl(spider, requests)
    log.start(loglevel='DEBUG')

    # report is called when the crawler finishes; it creates the XUnit file
    report = lambda: check.results.report(check.results.error_report_file)
    dispatcher.connect(report, signals.engine_stopped)

    crawler.start()
class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(Settings())
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, queue, search):
        log.start(loglevel=log.DEBUG)
        current_spider = CraigslistSpider()
        if search:
            current_spider.set_search_url(search)
        self.crawler.crawl(current_spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(current_spider.get_object_list())

    def crawl(self, search=""):
        q = Queue()
        p = Process(target=self._crawl, args=(q, search))
        p.start()
        p.join()
        return q.get()
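# Illustrative call of the Craigslist-specific CrawlerScript above; the search
# URL is an assumption, not taken from the original code.
craigslist = CrawlerScript()
listings = craigslist.crawl(search="http://sfbay.craigslist.org/search/sss?query=bicycle")
print len(listings), "listings found"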
class CrawlerScript:

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, "crawler"):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        # if spider:
        #     self.crawler.queue.append_spider(spider)
        #     self.crawler.start()
        #     self.crawler.stop()
        #     queue.put(self.items)
        return spider.crawl()
class CrawlerScript():

    def __init__(self, spider, results):
        self.results = results
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)
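# Hypothetical driver for this CrawlerScript variant: it exposes run() and a
# results queue rather than spawning its own process, so one option is to wrap
# it in multiprocessing.Process. MySpider and the names below are illustrative.
results = Queue()
script = CrawlerScript(MySpider(), results)
p = Process(target=script.run)
p.start()
p.join()
print results.get()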
class DomainCrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

    def _crawl(self, domain_pk):
        domain = Domain.objects.get(pk=domain_pk)
        urls = []
        for page in domain.pages.all():
            urls.append(page.url())
        self.crawler.crawl(DomainSpider(urls))
        self.crawler.start()
        self.crawler.stop()

    def crawl(self, domain_pk):
        p = Process(target=self._crawl, args=[domain_pk])
        p.start()
        p.join()
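# A sketch of how DomainCrawlerScript might be driven from a Django view or
# task, assuming a Domain row with the given primary key exists; the pk value
# is illustrative.
crawler_script = DomainCrawlerScript()
crawler_script.crawl(domain_pk=42)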
def main(parser):
    """Set up the item signal and run the spider."""
    # set up signal to catch items scraped
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher

    # shut off logging to the console
    def catch_item(sender, item, **kwargs):
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    # shut off log
    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False
    settings.overrides['FEED_URI'] = 'stdout.csv'
    settings.overrides['FEED_FORMAT'] = 'csv'

    # set up crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()

    args, opts = parser.parse_args()

    # schedule spider
    ext = tldextract.extract(opts[0])
    allowed_domain = '.'.join(ext[1:])
    spider = SeoSpider(opts[0], allowed_domain)
    #spider.set_url('http://www.newcustomerworkshop.com')
    crawler.crawl(spider)

    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    print "ENGINE STOPPED"
def _run_crawl_process(**kwargs):
    # log.start must be explicitly called
    log.start(loglevel=getattr(django_settings, 'SCRAPY_LOG_LEVEL', 'INFO'))

    # region How to run a crawler in-process
    # examples on how to get this stuff:
    # http://stackoverflow.com/questions/14777910/scrapy-crawl-from-script-always-blocks-script-execution-after-scraping?lq=1
    # http://stackoverflow.com/questions/13437402/how-to-run-scrapy-from-within-a-python-script
    # http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python
    # http://stackoverflow.com/questions/15564844/locally-run-all-of-the-spiders-in-scrapy
    # https://groups.google.com/forum/#!topic/scrapy-users/d4axj6nPVDw
    # endregion

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    spider = crawler.spiders.create(kwargs['spider'], **kwargs)
    crawler.crawl(spider)
    log.msg('Spider started...')
    crawler.start()
    log.msg('Spider stopped.')
    crawler.stop()
def run_spider(spider, settings):
    """Run a spider with given settings."""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess
    settings = CrawlerSettings(values=settings)
    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)
    #log.start(loglevel='DEBUG')
    crawler.start()
class ScraperTest(TestCase):
    SERVER_URL = 'http://*****:*****@href', from_detail_page=False)
        self.se_url.save()

        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc, scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True, mandatory=False)
        self.se_desc.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1, name=u'Event Website', scraper=self.scraper,
            url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]
        settings.overrides['IMAGES_STORE'] = os.path.join(self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = {
            'small': (170, 170),
        }

        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()
        for name, signal in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, signal)

    def tearDown(self):
        pass