def setup_crawler(origem, destino, ano_saida, mes_saida, dia_saida,
                  ano_chegada, mes_chegada, dia_chegada):
    spider = SubmarinoSpiderSpider(origem=origem, destino=destino,
                                   ano_saida=ano_saida, mes_saida=mes_saida,
                                   dia_saida=dia_saida, ano_chegada=ano_chegada,
                                   mes_chegada=mes_chegada, dia_chegada=dia_chegada,
                                   user_browser=random_header())
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler(self, spider):
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(self.spider_closed, signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    self.crawler = crawler
    self.crawler.start()
def goGrabSomeBags():
    spider = PriceWatcherSpider(domain='barneys.com')
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def crawl(cls, sites):
    stat = {"spiders": 0}

    def soft_stop_reactor():
        stat["spiders"] -= 1
        if not stat["spiders"]:
            reactor.stop()

    for site in sites:
        try:
            spider = site.parser.spider(site)
        except (NotImplementedError, ObjectDoesNotExist):
            logger.error(_('Spider not implemented for "%s" site', site.label))
        else:
            stat["spiders"] += 1
            with spider_project(spider) as settings:
                crawler = Crawler(settings)
                crawler.signals.connect(soft_stop_reactor, signal=signals.spider_closed)  # reactor.stop
                crawler.configure()
                crawler.crawl(spider)
                crawler.start()
    logfile = open('crawl.log', 'w')
    log_observer = log.ScrapyFileLogObserver(logfile, level=logging.INFO)
    log_observer.start()
    # the script will block here until the spider_closed signal was sent
    reactor.run()
def setupCrawler(spider):
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(crawler_started, signals.engine_started)
    crawler.signals.connect(crawler_stopped, signals.engine_stopped)
    crawler.crawl(crawler.spiders.create(spider))
    crawler.start()
class SWACrawlerScript(object):
    def __init__(self, origin, destination, date, debug=False, defaultSettings=True):
        self.debug = debug
        self.origin = origin
        self.destination = destination
        self.date = date
        # initialize spider
        self.spider = SWAFareSpider(self.origin, self.date, self.destination)
        # initialize settings
        settingValues = self.loadSettings() if defaultSettings else dict()
        self.settings = Settings(values=settingValues)
        # initialize crawler
        self.crawler = Crawler(self.settings)
        self.crawler.configure()
        print "Set up"

    def loadSettings(self):
        # collect every public name from the settings module via getattr
        # (safer than the original eval-based lookup)
        settingsList = [i for i in dir(swa.settings) if i[0] != "_"]
        settingsDict = {}
        for s in settingsList:
            settingsDict[s] = getattr(swa.settings, s)
        return settingsDict

    def run(self):
        print "Running"
        self.crawler.crawl(self.spider)
        self.crawler.start()
        if self.debug:
            log.start(loglevel=log.DEBUG)
        reactor.run()
def __crawl(self, hiddenWebSite, localPort, extraPath='', crawlImages=True,
            crawlLinks=True, crawlContents=True, crawlFormData=True):
    def catch_item(sender, item, **kwargs):
        item['url'] = item['url'].replace('http://127.0.0.1:' + str(localPort) + extraPath, hiddenWebSite)
        print "[+]Processing URL %s ... " % (item['url'])
        from core.tortazo.databaseManagement.TortazoDatabase import TortazoDatabase
        database = TortazoDatabase()
        database.initDatabaseDeepWebCrawlerPlugin()
        self.__processPage(item, database)

    # setup crawler
    dispatcher.connect(catch_item, signal=signals.item_passed)
    dispatcher.connect(reactor.stop, signal=signals.spider_closed)
    settings = get_project_settings()
    settings.set('ITEM_PIPELINES', {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}, priority='cmdline')
    settings.set('IMAGES_STORE', config.deepWebCrawlerOutdir + hiddenWebSite)
    crawler = Crawler(settings)
    crawler.configure()
    spider = HiddenSiteSpider("http://127.0.0.1:" + str(localPort) + extraPath, hiddenWebSite, self.extractorRules)
    spider.setImages(crawlImages)
    spider.setLinks(crawlLinks)
    spider.setContents(crawlContents)
    spider.setForms(crawlFormData)
    crawler.crawl(spider)
    print "\n[+] Starting scrapy engine... this process could take some time, depending on the crawling and extractor rules applied... \n"
    crawler.start()
    reactor.run()
    print "[+] Crawler finished."
def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    # I suspect web actions may be broken...
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        #call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        #call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]
def setup_crawler(spider_class, **kwargs):
    """
    Use scrapy in a script
    see http://doc.scrapy.org/en/latest/topics/practices.html

    :param spider_class: Spider class to test
    :type spider_class: text
    """
    def add_item(item):
        items.append(item)

    items = []
    # create Crawler
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # connect collecting function on item_passed
    crawler.signals.connect(add_item, signals.item_passed)
    # create & connect spider
    spider = spider_class(**kwargs)
    crawler.crawl(spider)
    # start crawler
    log.start()
    crawler.start()
    # run crawler
    task.deferLater(reactor, 1, reactor.stop)
    reactor.run()
    return items
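# Usage sketch for setup_crawler above; DummySpider is a hypothetical
# spider class used purely for illustration, and the kwargs are passed
# straight through to its constructor.
items = setup_crawler(DummySpider, start_urls=['http://example.com'])
for item in items:
    print item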
def setup_crawler(ticker):
    spider = StatsSpider(ticker=ticker)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler(id="550", publisher="rbd"): spider = DmmQuerySpider(id, publisher) settings = get_project_settings() crawler = Crawler(settings) crawler.configure() crawler.crawl(spider) crawler.start()
def setup(self):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(self._next_crawl, signal=signals.spider_closed)
    crawler.crawl(self.spider)
    crawler.start()
def setup_crawler(user, website, validator_set, parameters):
    spider = WebQualitySpider(user=user, website=website,
                              validators=validator_set, parameters=parameters)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
class startPageSpiderService(service.Service):
    def __init__(self, parent):
        self.spiderService = parent
        self._crawler = Crawler(settings)
        self._crawler.configure()
        self._spider = startPageSpider(taskId=self.spiderService.taskId)

    def getStats(self):
        return self._crawler.stats.get_stats()

    def startService(self):
        service.Service.startService(self)
        self._crawler.signals.connect(self.stopService, signals.spider_closed)
        self._crawler.crawl(self._spider)
        self.startCrawl()

    def startCrawl(self):
        if not self._crawler.engine.running:
            self._crawler.start()

    def stopService(self):
        log.msg(format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        service.Service.stopService(self)
        self.spiderService.removeSpiderService()
        self._crawler.stop()
        if self.name in self.spiderService.namedServices:
            self.spiderService.removeService(self)
def setup_crawler(self, supermarket, reactor_control):
    """Set up the Scrapy crawler.

    See http://doc.scrapy.org/en/latest/topics/practices.html#run-scrapy-from-a-script.

    Keyword arguments:
    supermarket -- the supermarket whose crawler should be set up
    """
    cachefile = supermarket_filename(supermarket)
    if isfile(cachefile):
        remove(cachefile)
    settings = get_project_settings()
    url = supermarket_url(supermarket)
    settings.set('FEED_URI', supermarket_filename(supermarket))
    spider = MySupermarketSpider(url)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor_control.add_crawler()
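# A minimal sketch of the reactor_control helper assumed above (hypothetical,
# not taken from the original project): it counts running crawlers and stops
# the shared reactor only when the last one closes.
from twisted.internet import reactor

class ReactorControl(object):
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()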
def main():
    """Set up the item signal and run the spider."""
    from twisted.internet import reactor
    from scrapy import signals
    from scrapy.settings import Settings
    from scrapy.crawler import Crawler

    def catch_item(sender, item, **kwargs):
        print "Got:", item

    settings = Settings()
    # set up crawler
    crawler = Crawler(settings)
    # shut off log
    crawler.settings.set('LOG_ENABLED', False, priority='cmdline')
    # set up signal to catch items scraped
    crawler.signals.connect(catch_item, signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.install()
    crawler.configure()
    # schedule spider
    spider = MySpider()
    crawler.crawl(spider)
    # start engine scrapy/twisted
    print "STARTING ENGINE"
    crawler.start()
    reactor.run()
    print "ENGINE STOPPED"
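# For reference, a hedged sketch of the same pattern on newer Scrapy (>= 1.0),
# where Crawler.install()/configure() no longer exist and CrawlerProcess
# manages the reactor itself; MySpider is the spider class from the snippet above.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'LOG_ENABLED': False})
process.crawl(MySpider)
process.start()  # blocks until the crawl finishes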
class LinkGetter(scrapy.Spider):
    def __init__(self, domain):
        """
        domain: the domain to extract links from (must start with http://)
        """
        self.domain = domain
        # avoid shadowing the scrapy module: keep the inner spider on self.spider
        self.spider = scrapy.Spider(self.domain)
        self.spider.allowed_domains = [self.spider.name]
        self.spider.start_urls = [self.spider.name]
        self.settings = get_project_settings()
        self.crawler = Crawler(self.settings)
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.crawler.configure()
        self.links = []
        self.crawler.crawl(self.spider)
        self.crawler.start()
        log.start()
        reactor.run()

    def parse(self, response):
        self.links.extend(LinkExtractor.extract_links(response))
def setup_crawler():
    spider = doubanMovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def runspider():
    date = datetime.datetime.utcnow()
    unix_date = calendar.timegm(date.utctimetuple())
    route = request.args.get('route')
    domain = request.args.get('domain')
    directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
    if not os.path.exists(directory):
        os.makedirs(directory)
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
    log.start(loglevel=logging.DEBUG)
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = MySpider(route, unix_date)
    settings_module = importlib.import_module('SiteCrawler.settings')
    settings = CrawlerSettings(settings_module)
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
    return redirect(url_for('choose_graph', domain=domain, date=unix_date))
def setUp(self):
    """Initialize the test."""
    settings.LOG_LEVEL = 'DEBUG'
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
def test_priorization(self):
    webdriver = Mock()
    settings = self.settings(WEBDRIVER_BROWSER=webdriver)
    webdriver.get.side_effect = self._wait
    webdriver.page_source = u''
    dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)
    crawler = Crawler(Settings(values=settings))
    crawler.configure()
    spider = self.Spider(name='test', domain='testdomain')
    crawler.crawl(spider)
    crawler.start()
    log.start(loglevel='ERROR')
    reactor.run()
    assert webdriver.get.mock_calls == [
        call('http://testdomain/path?wr=0'),
        call('http://testdomain/path?wr=0&wa=0'),
        call('http://testdomain/path?wr=0&wa=1'),
        call('http://testdomain/path?wr=1'),
        call('http://testdomain/path?wr=1&wa=0'),
        call('http://testdomain/path?wr=1&wa=1'),
        call('http://testdomain/path?wr=0&wa=0&wr=0'),
        call('http://testdomain/path?wr=0&wa=1&wr=0'),
        call('http://testdomain/path?wr=1&wa=0&wr=0'),
        call('http://testdomain/path?wr=1&wa=1&wr=0')]
def set_crawler(spider, receiver):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(receiver.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def call_spider(file):
    """
    Creates the spider and runs the reactor.
    Copies the crawling results to the .json files and then transforms
    them into the corresponding data.json files.
    """
    with open(file, "r") as f:
        list_url = f.readlines()
    domains = []
    urls = []
    created_files = []
    for u in list_url:
        domain = u.strip('\n')
        url_aux = domain.split("/")
        domain_type = False
        if len(url_aux) > 1:
            domain = url_aux[0]
            url = "http://" + url_aux[0] + "/datos/data"
            if domain == 'www.paraguay.gov.py':
                url = "http://" + url_aux[0] + "/datos"
        else:
            url = "http://" + u.strip('\n') + "/data"
            domain_type = True
        print "============= Domain " + domain
        print "============= Start url " + url
        response = requests.get(url + "/data.json")
        if response.status_code == 200:
            filename = FileController.FileController().save_existing_data_json(response, domain, True)
            created_files.append({'modalidad': 'recolecta', 'archivo': filename})
        else:
            domains.append(domain)
            urls.append(url)
    spider = DataSpider(domains=domains, start_urls=urls, domain_type=domain_type)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start(logfile="log.txt", loglevel=log.DEBUG, logstdout=False)
    reactor.run()  # the script will block here
    # Copy the data to the .json files
    data_spider.copy_items_to_files()
    # Remove temporary files
    FileController.FileController().clean_tmp_files()
    # Convert the .json files to data.json (POD format)
    for domain in domains:
        filename = DataJson.DataJson().convert(domain)
        created_files.append({'modalidad': 'data-hunting', 'archivo': filename})
    return created_files
def setup_crawler():
    spider = ScsSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def handle(self, *args, **options):
    if (not len(args) == 1) or (args[0] == u"help"):
        self.stdout.write(u"Usage: {0}\n".format(self.args))
        self.stdout.write(self.help)
    else:
        settings = get_project_settings()
        settings.overrides["URLS"] = args[0]
        crawler = Crawler(settings)
        spider = GeneralSpider()
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
        log.start_from_crawler(crawler)
        # stop the reactor once the spider has finished
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        try:
            log.msg("Running reactor...")
            reactor.run()
        except KeyboardInterrupt:
            stop_reactor()
        finally:
            log.msg("Reactor stopped")
            log.msg("#" * 40)
def setup_crawler(spider_name):
    # look the spider class up by name instead of using exec
    spider = globals()[spider_name]()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def _setup(self, project):
    spider = crawlspider.LinkSpider(project)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    self.add_crawler()
class Wallhaven_Crawler:
    def __init__(self, query):
        self.query = query
        # Creation of spider from query
        self.spider = WallhavenSpider(self.query)
        # Getting scrapy project settings
        self.settings = get_project_settings()
        # Creation of crawler from spider and scrapy project settings
        self.crawler = Crawler(self.settings)
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.crawler.configure()

    def start(self):
        # Crawling from spider
        self.crawler.crawl(self.spider)
        self.crawler.start()
        # Logging all process
        #log.start()
        #log.msg('Reactor activated.')
        # Execution of twisted reactor
        reactor.run()  # The script will block here until the 'spider_closed' signal is sent
def setup_crawler(spider, stop=False):
    '''
    Takes a spider class object
    '''
    # Deferred means other functions can wait on this finishing
    # Wait until the callback is triggered by spider close
    # See twisted docs
    d = defer.Deferred()

    def foo(*a, **kw):
        # The result to be passed to any callbacks to deferred
        # (we don't use it, so True could've been False, None w/e)
        d.callback(True)

    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Ref to foo otherwise it gets GC'd (garbage collected)
    crawler._tempref = foo
    # foo is the handler for the closed signal from this spider
    # N.B. dispatch returns spider and reason (e.g. 'finished') to foo.
    crawler.signals.connect(foo, signal=signals.spider_closed)
    crawler.crawl(spider)
    # N.B. log is scrapy log. log2 is python color logger
    # The crawler arg is necessary for log_count/{ERROR, DEBUG, INFO..} stats
    # which you will want for stats mailer extension.
    # Starting this each time will cause a big torrent of ESMTP errors
    # log.start(crawler=crawler)
    crawler.start()
    return d
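# Usage sketch: chain crawls through the returned Deferred (SpiderA and
# SpiderB are hypothetical spider instances). Returning a Deferred from a
# callback pauses the chain until that crawl's spider_closed fires.
d = setup_crawler(SpiderA())
d.addCallback(lambda _: setup_crawler(SpiderB()))
d.addCallback(lambda _: reactor.stop())
reactor.run()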
def _crawl_next(self, spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(self._done_task, signal=signals.spider_closed)
    crawler.crawl(spider)
    crawler.start()
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.requests = self.spider.start_requests()
def setup_crawler():
    spider = DmmDirectSpider(url=sys.argv[1])
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def handle(self, url_slug, **options):
    page = Page.objects.get(url_slug=url_slug)
    feed = page.feed
    store = page.store
    store_slug = store.slug.lower()
    opts = {
        'recreate_tiles': options['recreate_tiles'],
        'skip_images': not options['update_images'],
        'skip_tiles': True,
    }
    start_urls = []
    for tile in feed.tiles.all():
        if tile.product:
            start_urls.append(tile.product.url)
        for content in tile.content.all():
            for prod in content.tagged_products.all():
                start_urls.append(prod.url)
    start_urls = set(start_urls)
    # set up standard framework for running spider in a script
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    spider = crawler.spiders.create(store_slug, **opts)
    spider.start_urls = start_urls
    spider.feed_id = feed.id
    crawler.crawl(spider)
    logging.info('Starting spider with options: {}'.format(opts))
    crawler.start()
    reactor.run()
def setup_crawler(domain):
    spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def parse_careers(spider):
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def setup_crawler(domain, spidername):
    spider_class = globals()[spidername]
    spider = spider_class(domain=domain)
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
class CrawlerWorker(multiprocessing.Process):
    def __init__(self, spider, result_queue):
        multiprocessing.Process.__init__(self)
        self.result_queue = result_queue
        self.crawler = Crawler(Settings())
        self.crawler.configure()
        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
        dispatcher.connect(self._stop_reactor, signal=signals.spider_closed)

    def _item_passed(self, item):
        self.items.append(item)

    def _stop_reactor(self):
        reactor.stop()

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
        self.result_queue.put(self.items)
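# Usage sketch for CrawlerWorker (MySpider is a hypothetical spider):
# running each crawl in a child process sidesteps the "reactor is not
# restartable" problem when scraping repeatedly from one parent process.
result_queue = multiprocessing.Queue()
worker = CrawlerWorker(MySpider(), result_queue)
worker.start()
worker.join()
items = result_queue.get()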
def runSpider(args):
    spider = args[0]
    settings = args[1]
    crawler = Crawler(settings)
    crawler.signals.connect(stopCrawler, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def call_spider(spider):
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
def setup_crawler(spider_name):
    #spider = FollowAllSpider(domain=domain)
    settings = get_project_settings()
    crawler = Crawler(settings)
    spider = crawler.spiders.create(spider_name)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def crawl():
    spider = StackserviceSpider()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()  # the script will block here
def crawl():
    crawler = Crawler(settings)
    spider = MySpider()
    crawler.signals.connect(callback, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()
def setup_crawler(domain):
    spider = MovieSpider()
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def setup_crawler(stuff):
    spider = MySpider(stuff=stuff)
    settings = Settings()
    #settings.setdict(env_overrides, priority='project')
    crawler = Crawler(settings)
    crawler.signals.connect(crawlstack, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def run(self):
    crawler = Crawler(get_project_settings())
    crawler.configure()
    log.start()
    for spiderName in crawler.spiders.list():
        self.spiderCounter += 1
        self.setupCrawler(spiderName)
    reactor.run()
def setup_crawler():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = AutoRobot_Prenium()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()
def setUp(self):
    """Initialize the test."""
    crawler = Crawler(CrawlerSettings(settings))
    crawler.configure()
    self.spider = ebird_spider.EBirdSpider('REG')
    self.spider.set_crawler(crawler)
    self.spider.start_requests()
    self.records = [{
        'checklistID': 'CL00001',
        'comName': 'Common Name',
        'countryCode': 'CC',
        'countryName': 'Country',
        'firstName': 'Name',
        'howMany': 1,
        'lastName': 'Surname',
        'lat': 45.000000,
        'lng': -45.000000,
        'locID': 'L0000001',
        'locName': 'Location 1',
        'locationPrivate': True,
        'obsDt': '2013-03-27 09:00',
        'obsID': 'OBS0000001',
        'obsReviewed': False,
        'obsValid': True,
        'presenceNoted': False,
        'sciName': 'Scientific Name',
        'subID': 'S0000001',
        'subnational1Code': 'SN-01',
        'subnational1Name': 'Region',
        'subnational2Code': 'SN-02',
        'subnational2Name': 'County',
    }, {
        'checklistID': 'CL00002',
        'comName': 'Common Name',
        'countryCode': 'CC',
        'countryName': 'Country',
        'firstName': 'Name',
        'howMany': 1,
        'lastName': 'Surname',
        'lat': 50.000000,
        'lng': -50.000000,
        'locID': 'L0000002',
        'locName': 'Location 2',
        'locationPrivate': True,
        'obsDt': '2013-03-27 10:00',
        'obsID': 'OBS0000002',
        'obsReviewed': False,
        'obsValid': True,
        'presenceNoted': False,
        'sciName': 'Scientific Name',
        'subID': 'S0000002',
        'subnational1Code': 'SN-01',
        'subnational1Name': 'Region',
        'subnational2Code': 'SN-02',
        'subnational2Name': 'County',
    }]
def run(self, args, opts):
    settings = get_project_settings()
    for spider_name in self.crawler.spiders.list():
        crawler = Crawler(settings)
        crawler.configure()
        spider = crawler.spiders.create(spider_name)
        crawler.crawl(spider)
        crawler.start()
def config_spider(self, spid, spider):
    """The boring startup routine"""
    proj_settings = get_project_settings()
    crawler = Crawler(proj_settings)
    self._ids_to_crawlers_map[spid] = {"spider": spider, "crawler": crawler}
    # connect each spider's closed signal to self; when all spiders are done, stop the reactor
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    # I do not really know if that is appended or overwritten
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
def get_more_entropy():
    spider = TruenetSpider(domain='truenet.co.nz')
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path)
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)
        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping
    return job.raw_response
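# Usage sketch (assuming do_scrape is registered as a Celery task and
# 'my_spider' is a hypothetical spider name in the project):
result = do_scrape.delay('my_spider')  # queue the scrape on a worker
output_path = result.get()             # blocks until the jsonlines path is returned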