def setUp(self):
    """Build the per-test fixture: a clean image dir, Scrapy settings,
    a configured crawler, and the full DDS scraper object graph in the DB.

    NOTE(review): runs against the Scrapy API generation that has
    ``Settings.set(..., priority=...)`` together with ``Crawler(settings)``
    plus ``crawler.configure()`` — confirm against the pinned Scrapy version.
    """
    # Recreate the image store directory from scratch so files produced
    # by a previous test run cannot leak into this one.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)
    # Forward the test-specific DDS settings at 'cmdline' priority so
    # they override any project-level defaults.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    # Optional settings — only applied when the test config provides them.
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')
    settings.set('COOKIES_DEBUG', True)
    # Stop the Twisted reactor when the spider closes so the (blocking)
    # test run can continue past the crawl.
    self.crawler = Crawler(settings)
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.crawler.configure()
    # ---- Database fixtures -------------------------------------------
    # Scraped object class and its attributes: the repeated base element,
    # a title, the url (marked id_field=True, i.e. used for dupe
    # detection), and an optional description.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    # Active ('A') scraper tied to the object class above.
    self.scraper = Scraper(
        name=u'Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    # XPath extraction rules: base selects the repeated list items on the
    # main page; title/url are relative to base; description is scraped
    # from the detail page and is not mandatory.
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper, x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper, x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper, x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(
        scraped_obj_attr=self.soa_desc, scraper=self.scraper,
        x_path=u'//div/div[@class="description"]/text()',
        from_detail_page=True, mandatory=False)
    self.se_desc.save()
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    # The website under test, pointing at the local fixture server.
    # NOTE(review): os.path.join on a URL works here only because
    # SERVER_URL presumably has no trailing slash issues — confirm.
    self.event_website = EventWebsite(
        pk=1, name=u'Event Website', scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()
    # Record every public Scrapy signal fired during the crawl so tests
    # can assert on which signals were sent.
    for name, signal in vars(signals).items():
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)
def setUp(self):
    """Build the per-test fixture: the DDS scraper object graph in the
    DB, image-pipeline settings, and an installed crawler process.

    NOTE(review): this variant uses ``settings.overrides[...]`` and
    ``CrawlerProcess(...).install()/.configure()`` — the pre-0.16 Scrapy
    settings/crawler API; confirm against the pinned Scrapy version.
    """
    # ---- Database fixtures -------------------------------------------
    # Scraped object class and its attributes. Unlike the sibling
    # fixtures, the url attribute here is NOT marked as an id_field.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    # Active ('A') scraper tied to the object class above.
    self.scraper = Scraper(
        name=u'Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    # XPath extraction rules: base selects the repeated list items on the
    # main page; title/url are relative to base; description comes from
    # the detail page and is optional.
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper, x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper, x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper, x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(
        scraped_obj_attr=self.soa_desc, scraper=self.scraper,
        x_path=u'//div/div[@class="description"]/text()',
        from_detail_page=True, mandatory=False)
    self.se_desc.save()
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    # The website under test, pointing at the local fixture server.
    self.event_website = EventWebsite(
        pk=1, name=u'Event Website', scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()
    # Image-pipeline settings installed via the legacy overrides dict:
    # image download + validation + the test writer pipeline, a local
    # image store, and one thumbnail size.
    settings.overrides['ITEM_PIPELINES'] = [
        'dynamic_scraper.pipelines.DjangoImagesPipeline',
        'dynamic_scraper.pipelines.ValidationPipeline',
        'scraper.scraper_test.DjangoWriterPipeline',
    ]
    settings.overrides['IMAGES_STORE'] = os.path.join(
        self.PROJECT_ROOT, 'imgs')
    settings.overrides['IMAGES_THUMBS'] = {
        'small': (170, 170),
    }
    # Legacy crawler bootstrap: install the process globally, then
    # configure it.
    self.crawler = CrawlerProcess(settings)
    self.crawler.install()
    self.crawler.configure()
    # Record every public Scrapy signal fired during the crawl so tests
    # can assert on which signals were sent.
    for name, signal in vars(signals).items():
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)
def setUp(self):
    """Build the per-test fixture: a clean image dir, Scrapy settings, a
    CrawlerProcess, and the full DDS scraper object graph — this is the
    newer-schema variant using RequestPageType ('MP'/'DP1') instead of
    the boolean ``from_detail_page`` flag.
    """
    # Recreate the image store directory from scratch so files produced
    # by a previous test run cannot leak into this one.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)
    # Forward the test-specific DDS settings at 'cmdline' priority so
    # they override any project-level defaults.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    # Optional settings — only applied when the test config provides them.
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')
    settings.set('COOKIES_DEBUG', True)
    # Verbose level kept but logging disabled to keep test output quiet.
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('LOG_ENABLED', False)
    # Old single-crawler bootstrap, superseded by CrawlerProcess below.
    #self.crawler = Crawler(settings)
    #self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #self.crawler.configure()
    self.process = CrawlerProcess(settings)
    # ---- Database fixtures -------------------------------------------
    # Scraped object class and its attributes: base element, title, url
    # (id_field=True, i.e. used for dupe detection), a second url, two
    # descriptions, and one extra standard attribute that is extracted
    # but not persisted (save_to_db=False).
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name='base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name='title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name='url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_url2 = ScrapedObjAttr(name='url2', attr_type='U', obj_class=self.sc)
    self.soa_url2.save()
    self.soa_desc = ScrapedObjAttr(name='description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    self.soa_desc2 = ScrapedObjAttr(name='description2', attr_type='S', obj_class=self.sc)
    self.soa_desc2.save()
    self.soa_es_1 = ScrapedObjAttr(name='extra_standard_1', attr_type='S', obj_class=self.sc, save_to_db=False)
    self.soa_es_1.save()
    # Active ('A') scraper tied to the object class above.
    self.scraper = Scraper(
        name='Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    # XPath extraction rules keyed by request page type: 'MP' elements
    # run on the main page, 'DP1' on the first detail page.
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper, x_path='//ul/li', request_page_type='MP')
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper, x_path='a/text()', request_page_type='MP')
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper, x_path='a/@href', request_page_type='MP')
    self.se_url.save()
    self.se_desc = ScraperElem(
        scraped_obj_attr=self.soa_desc, scraper=self.scraper,
        x_path='//div/div[@class="description"]/text()',
        request_page_type='DP1', mandatory=False)
    self.se_desc.save()
    self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper, x_path='a/text()', request_page_type='MP')
    self.se_es_1.save()
    # Page-type definitions: the main page, and detail page 1 whose
    # request URL comes from the scraped url attribute.
    self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper)
    self.rpt_mp.save()
    self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper, scraped_obj_attr=self.soa_url)
    self.rpt_dp1.save()
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    # The website under test, pointing at the local fixture server.
    self.event_website = EventWebsite(
        pk=1, name='Event Website', scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()
    # Record every public Scrapy signal fired during the crawl so tests
    # can assert on which signals were sent. list(...) guards against
    # the vars() dict changing size while connecting (Python 3 safety).
    for name, signal in list(vars(signals).items()):
        if not name.startswith('_'):
            dispatcher.connect(self.record_signal, signal)