def setUp(self):
    """Build a clean image dir, per-test Scrapy settings, a crawler and DB fixtures."""
    # Start every test run with an empty image directory.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    # Apply the per-test settings with 'cmdline' priority so they beat
    # the project-level defaults.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')

    # Crawler wired to stop the Twisted reactor when the spider closes.
    self.crawler = Crawler(settings)
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.crawler.configure()

    # Scraped object class and its attributes; 'url' is the ID field.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()

    # Active scraper plus one scraper element per attribute; only the
    # description is taken from the detail page and is optional.
    self.scraper = Scraper(name=u'Event Scraper', scraped_obj_class=self.sc, status='A',)
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path=u'//div/div[@class="description"]/text()',
                               from_detail_page=True, mandatory=False)
    self.se_desc.save()

    # Scheduler runtime and the website object pointing at the test server.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(pk=1, name=u'Event Website', scraper=self.scraper,
                                      url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
                                      scraper_runtime=self.sched_rt,)
    self.event_website.save()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in vars(signals).items():
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)
def setUp(self):
    """Create DB fixtures, set image-pipeline overrides and install the crawler."""
    # Scraped object class and its attributes.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()

    # Scraper with one element per attribute; the description is read
    # from the detail page and is optional.
    self.scraper = Scraper(name=u'Event Scraper', scraped_obj_class=self.sc)
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path=u'//div/div[@class="description"]/text()',
                               from_detail_page=True, mandatory=False)
    self.se_desc.save()

    # Scheduler/scraper runtimes and the website object under test.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.scraper_rt = ScraperRuntime(name=u'Events Runtime', scraper=self.scraper,
                                     url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
                                     status='A', scheduler_runtime=self.sched_rt)
    self.scraper_rt.save()
    self.event_website = EventWebsite(pk=1, name=u'Event Website',
                                      scraper_runtime=self.scraper_rt)
    self.event_website.save()

    # Image pipeline configuration via the legacy settings.overrides API.
    settings.overrides['ITEM_PIPELINES'] = [
        'dynamic_scraper.pipelines.DjangoImagesPipeline',
        'dynamic_scraper.pipelines.ValidationPipeline',
        'scraper.scraper_test.DjangoWriterPipeline',
    ]
    settings.overrides['IMAGES_STORE'] = os.path.join(self.PROJECT_ROOT, 'imgs')
    settings.overrides['IMAGES_THUMBS'] = {
        'small': (170, 170),
    }

    self.crawler = CrawlerProcess(settings)
    self.crawler.install()
    self.crawler.configure()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in vars(signals).items():
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)
def setUp(self):
    """Create the Event fixtures, image-pipeline overrides and a CrawlerProcess."""
    # Scraped object class and attributes.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()

    # Active scraper and its elements; description is optional and pulled
    # from the detail page.
    self.scraper = Scraper(
        name=u'Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path=u'//div/div[@class="description"]/text()',
                               from_detail_page=True, mandatory=False)
    self.se_desc.save()

    # Scheduler runtime and the website object pointing at the test server.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(
        pk=1,
        name=u'Event Website',
        scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()

    # Image pipeline configuration via the legacy settings.overrides API.
    settings.overrides['ITEM_PIPELINES'] = [
        'dynamic_scraper.pipelines.DjangoImagesPipeline',
        'dynamic_scraper.pipelines.ValidationPipeline',
        'scraper.scraper_test.DjangoWriterPipeline',
    ]
    settings.overrides['IMAGES_STORE'] = os.path.join(self.PROJECT_ROOT, 'imgs')
    settings.overrides['IMAGES_THUMBS'] = {
        'small': (170, 170),
    }

    self.crawler = CrawlerProcess(settings)
    self.crawler.install()
    self.crawler.configure()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in vars(signals).items():
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)
# NOTE(review): this class definition is corrupted — the span between
# "SERVER_URL = 'http://" and "@href'" (the server-URL constant and most of
# setUp) appears to have been swallowed by a credential-redaction pass
# ("*****:*****" matched a user:pass@host pattern greedily). Do not edit in
# place; recover the original text from version control.
class ScraperTest(TestCase): SERVER_URL = 'http://*****:*****@href', from_detail_page=False) self.se_url.save() self.se_desc = ScraperElem( scraped_obj_attr=self.soa_desc, scraper=self.scraper, x_path=u'//div/div[@class="description"]/text()', from_detail_page=True, mandatory=False) self.se_desc.save() self.sched_rt = SchedulerRuntime() self.sched_rt.save() self.event_website = EventWebsite( pk=1, name=u'Event Website', scraper=self.scraper, url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt, ) self.event_website.save() settings.overrides['ITEM_PIPELINES'] = [ 'dynamic_scraper.pipelines.DjangoImagesPipeline', 'dynamic_scraper.pipelines.ValidationPipeline', 'scraper.scraper_test.DjangoWriterPipeline', ] settings.overrides['IMAGES_STORE'] = os.path.join( self.PROJECT_ROOT, 'imgs') settings.overrides['IMAGES_THUMBS'] = { 'small': (170, 170), } self.crawler = CrawlerProcess(settings) self.crawler.install() self.crawler.configure() for name, signal in vars(signals).items(): if not name.startswith('_'): dispatcher.connect(self.record_signal, signal) def tearDown(self): pass
# NOTE(review): corrupted class definition — the text from the SERVER_URL
# string literal through the middle of setUp was replaced by a credential
# redaction ("*****:*****"). The original cannot be reconstructed from this
# file; restore it from version control before making changes.
class ScraperTest(TestCase): SERVER_URL = 'http://*****:*****@href', from_detail_page=False) self.se_url.save() self.se_desc = ScraperElem( scraped_obj_attr=self.soa_desc, scraper=self.scraper, x_path=u'//div/div[@class="description"]/text()', from_detail_page=True, mandatory=False) self.se_desc.save() self.sched_rt = SchedulerRuntime() self.sched_rt.save() self.event_website = EventWebsite( pk=1, name=u'Event Website', scraper=self.scraper, url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt, ) self.event_website.save() for name, signal in vars(signals).items(): if not name.startswith('_'): dispatcher.connect(self.record_signal, signal) def tearDown(self): pass
def setUp(self):
    """Reset the image dir, push test settings (incl. cookie debugging), build fixtures."""
    # Fresh image directory for each run.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    # Per-test settings at 'cmdline' priority so they override defaults.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT', self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'], priority='cmdline')
    settings.set('COOKIES_DEBUG', True)

    # Crawler wired to stop the reactor once the spider closes.
    self.crawler = Crawler(settings)
    self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    self.crawler.configure()

    # Scraped object class and attributes; 'url' is the ID field.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name=u'base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name=u'title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name=u'url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_desc = ScrapedObjAttr(name=u'description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()

    # Active scraper plus one element per attribute; only the description
    # comes from the detail page and is optional.
    self.scraper = Scraper(
        name=u'Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path=u'//ul/li', from_detail_page=False)
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path=u'a/text()', from_detail_page=False)
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path=u'a/@href', from_detail_page=False)
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path=u'//div/div[@class="description"]/text()',
                               from_detail_page=True, mandatory=False)
    self.se_desc.save()

    # Scheduler runtime and the website object under test.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(
        pk=1,
        name=u'Event Website',
        scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in vars(signals).items():
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)
# NOTE(review): corrupted class definition — everything between the opening
# of the SERVER_URL string and "@href'" was consumed by a credential
# redaction ("*****:*****"). Restore the original from version control; the
# remaining fragment is not valid Python and must not be edited in place.
class ScraperTest(TestCase): SERVER_URL = 'http://*****:*****@href', from_detail_page=False) self.se_url.save() self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper, x_path=u'//div/div[@class="description"]/text()', from_detail_page=True, mandatory=False) self.se_desc.save() self.sched_rt = SchedulerRuntime() self.sched_rt.save() self.event_website = EventWebsite(pk=1, name=u'Event Website', scraper=self.scraper, url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt,) self.event_website.save() settings.overrides['ITEM_PIPELINES'] = [ 'dynamic_scraper.pipelines.DjangoImagesPipeline', 'dynamic_scraper.pipelines.ValidationPipeline', 'scraper.scraper_test.DjangoWriterPipeline', ] settings.overrides['IMAGES_STORE'] = os.path.join(self.PROJECT_ROOT, 'imgs') settings.overrides['IMAGES_THUMBS'] = { 'small': (170, 170), } self.crawler = CrawlerProcess(settings) self.crawler.install() self.crawler.configure() for name, signal in vars(signals).items(): if not name.startswith('_'): dispatcher.connect(self.record_signal, signal) def tearDown(self): pass
# NOTE(review): corrupted class definition — the SERVER_URL constant and a
# large part of setUp were replaced by a credential redaction
# ("*****:*****"). Recover the original from version control; the fragment
# below is incomplete and not valid Python.
class ScraperTest(TestCase): SERVER_URL = "http://*****:*****@href", request_page_type="MP" ) self.se_url.save() self.se_desc = ScraperElem( scraped_obj_attr=self.soa_desc, scraper=self.scraper, x_path='//div/div[@class="description"]/text()', request_page_type="DP1", mandatory=False, ) self.se_desc.save() self.se_es_1 = ScraperElem( scraped_obj_attr=self.soa_es_1, scraper=self.scraper, x_path="a/text()", request_page_type="MP" ) self.se_es_1.save() self.rpt_mp = RequestPageType(page_type="MP", scraper=self.scraper) self.rpt_mp.save() self.rpt_dp1 = RequestPageType(page_type="DP1", scraper=self.scraper, scraped_obj_attr=self.soa_url) self.rpt_dp1.save() self.sched_rt = SchedulerRuntime() self.sched_rt.save() self.event_website = EventWebsite( pk=1, name="Event Website", scraper=self.scraper, url=os.path.join(self.SERVER_URL, "site_generic/event_main.html"), scraper_runtime=self.sched_rt, ) self.event_website.save() for name, signal in list(vars(signals).items()): if not name.startswith("_"): dispatcher.connect(self.record_signal, signal) def tearDown(self): self.event_website.delete() Event.objects.all().delete()
def setUp(self):
    """Reset the image dir, configure Scrapy, and build the RequestPageType-era fixtures."""
    # Fresh image directory for each run.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    # Per-test settings at 'cmdline' priority so they override defaults.
    settings.set("ITEM_PIPELINES", self.dds_settings["ITEM_PIPELINES"], priority="cmdline")
    settings.set("SPLASH_URL", self.dds_settings["SPLASH_URL"], priority="cmdline")
    settings.set("DUPEFILTER_CLASS", self.dds_settings["DUPEFILTER_CLASS"], priority="cmdline")
    settings.set("DOWNLOADER_MIDDLEWARES", self.dds_settings["DOWNLOADER_MIDDLEWARES"], priority="cmdline")
    settings.set("IMAGES_STORE", self.dds_settings["IMAGES_STORE"], priority="cmdline")
    if "IMAGES_THUMBS" in self.dds_settings:
        settings.set("IMAGES_THUMBS", self.dds_settings["IMAGES_THUMBS"], priority="cmdline")
    if "DSCRAPER_IMAGES_STORE_FORMAT" in self.dds_settings:
        settings.set("DSCRAPER_IMAGES_STORE_FORMAT",
                     self.dds_settings["DSCRAPER_IMAGES_STORE_FORMAT"],
                     priority="cmdline")
    settings.set("COOKIES_DEBUG", True)
    settings.set("LOG_LEVEL", "DEBUG")
    settings.set("LOG_ENABLED", False)

    # Modern Scrapy API: a CrawlerProcess replaces the old Crawler +
    # manual reactor wiring.
    self.process = CrawlerProcess(settings)

    # Scraped object class and attributes; 'url' is the ID field and
    # 'extra_standard_1' is scraped but not persisted.
    self.sc = ScrapedObjClass(name="Event")
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name="base", attr_type="B", obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name="title", attr_type="S", obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name="url", attr_type="U", obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_url2 = ScrapedObjAttr(name="url2", attr_type="U", obj_class=self.sc)
    self.soa_url2.save()
    self.soa_desc = ScrapedObjAttr(name="description", attr_type="S", obj_class=self.sc)
    self.soa_desc.save()
    self.soa_desc2 = ScrapedObjAttr(name="description2", attr_type="S", obj_class=self.sc)
    self.soa_desc2.save()
    self.soa_es_1 = ScrapedObjAttr(name="extra_standard_1", attr_type="S",
                                   obj_class=self.sc, save_to_db=False)
    self.soa_es_1.save()

    # Active scraper; elements target the main page ("MP") except the
    # optional description, which is read from detail page "DP1".
    self.scraper = Scraper(name="Event Scraper", scraped_obj_class=self.sc, status="A")
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path="//ul/li", request_page_type="MP")
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path="a/text()", request_page_type="MP")
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path="a/@href", request_page_type="MP")
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path='//div/div[@class="description"]/text()',
                               request_page_type="DP1", mandatory=False)
    self.se_desc.save()
    self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper,
                               x_path="a/text()", request_page_type="MP")
    self.se_es_1.save()

    # Request page types: the main page plus a detail page reached via
    # the url attribute.
    self.rpt_mp = RequestPageType(page_type="MP", scraper=self.scraper)
    self.rpt_mp.save()
    self.rpt_dp1 = RequestPageType(page_type="DP1", scraper=self.scraper,
                                   scraped_obj_attr=self.soa_url)
    self.rpt_dp1.save()

    # Scheduler runtime and the website object under test.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(
        pk=1,
        name="Event Website",
        scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, "site_generic/event_main.html"),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in list(vars(signals).items()):
        if not sig_name.startswith("_"):
            dispatcher.connect(self.record_signal, sig)
# NOTE(review): corrupted class definition — the SERVER_URL constant and
# most of setUp were swallowed by a credential redaction ("*****:*****").
# Restore the original from version control; the fragment below is
# incomplete and not valid Python.
class ScraperTest(TestCase): SERVER_URL = 'http://*****:*****@href', request_page_type='MP') self.se_url.save() self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper, x_path=u'//div/div[@class="description"]/text()', request_page_type='DP1', mandatory=False) self.se_desc.save() self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper) self.rpt_mp.save() self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper, scraped_obj_attr=self.soa_url) self.rpt_dp1.save() self.sched_rt = SchedulerRuntime() self.sched_rt.save() self.event_website = EventWebsite(pk=1, name=u'Event Website', scraper=self.scraper, url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'), scraper_runtime=self.sched_rt,) self.event_website.save() for name, signal in vars(signals).items(): if not name.startswith('_'): dispatcher.connect(self.record_signal, signal) def tearDown(self): pass
def setUp(self):
    """Prepare the image dir, Scrapy settings and the RequestPageType-era DB fixtures."""
    # Start with an empty image directory.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    # Per-test settings with 'cmdline' priority.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT',
                     self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'],
                     priority='cmdline')
    settings.set('COOKIES_DEBUG', True)
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('LOG_ENABLED', False)

    # Modern Scrapy API: CrawlerProcess replaces the old Crawler/reactor setup.
    self.process = CrawlerProcess(settings)

    # Scraped object class and attributes; 'url' is the ID field and
    # 'extra_standard_1' is scraped but not persisted.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name='base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name='title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name='url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_url2 = ScrapedObjAttr(name='url2', attr_type='U', obj_class=self.sc)
    self.soa_url2.save()
    self.soa_desc = ScrapedObjAttr(name='description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    self.soa_desc2 = ScrapedObjAttr(name='description2', attr_type='S', obj_class=self.sc)
    self.soa_desc2.save()
    self.soa_es_1 = ScrapedObjAttr(name='extra_standard_1', attr_type='S',
                                   obj_class=self.sc, save_to_db=False)
    self.soa_es_1.save()

    # Active scraper; elements target the main page ('MP') except the
    # optional description, which is read from detail page 'DP1'.
    self.scraper = Scraper(
        name='Event Scraper',
        scraped_obj_class=self.sc,
        status='A',
    )
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path='//ul/li', request_page_type='MP')
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path='a/text()', request_page_type='MP')
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path='a/@href', request_page_type='MP')
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path='//div/div[@class="description"]/text()',
                               request_page_type='DP1', mandatory=False)
    self.se_desc.save()
    self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper,
                               x_path='a/text()', request_page_type='MP')
    self.se_es_1.save()

    # Request page types: main page plus one detail page reached via 'url'.
    self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper)
    self.rpt_mp.save()
    self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper,
                                   scraped_obj_attr=self.soa_url)
    self.rpt_dp1.save()

    # Scheduler runtime and the website object under test.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(
        pk=1,
        name='Event Website',
        scraper=self.scraper,
        url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
        scraper_runtime=self.sched_rt,
    )
    self.event_website.save()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in list(vars(signals).items()):
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)
def setUp(self):
    """Set up image storage, Scrapy settings and all scraper fixtures for a test."""
    # Empty image directory per run.
    if os.path.exists(self.IMG_DIR):
        shutil.rmtree(self.IMG_DIR)
    os.mkdir(self.IMG_DIR)

    # Test settings win over project defaults via 'cmdline' priority.
    settings.set('ITEM_PIPELINES', self.dds_settings['ITEM_PIPELINES'], priority='cmdline')
    settings.set('SPLASH_URL', self.dds_settings['SPLASH_URL'], priority='cmdline')
    settings.set('DUPEFILTER_CLASS', self.dds_settings['DUPEFILTER_CLASS'], priority='cmdline')
    settings.set('DOWNLOADER_MIDDLEWARES', self.dds_settings['DOWNLOADER_MIDDLEWARES'], priority='cmdline')
    settings.set('IMAGES_STORE', self.dds_settings['IMAGES_STORE'], priority='cmdline')
    if 'IMAGES_THUMBS' in self.dds_settings:
        settings.set('IMAGES_THUMBS', self.dds_settings['IMAGES_THUMBS'], priority='cmdline')
    if 'DSCRAPER_IMAGES_STORE_FORMAT' in self.dds_settings:
        settings.set('DSCRAPER_IMAGES_STORE_FORMAT',
                     self.dds_settings['DSCRAPER_IMAGES_STORE_FORMAT'],
                     priority='cmdline')
    settings.set('COOKIES_DEBUG', True)
    settings.set('LOG_LEVEL', 'DEBUG')
    settings.set('LOG_ENABLED', False)

    # CrawlerProcess supersedes the old Crawler + reactor wiring.
    self.process = CrawlerProcess(settings)

    # Object class and attributes; 'url' identifies objects and
    # 'extra_standard_1' is scraped without being stored.
    self.sc = ScrapedObjClass(name='Event')
    self.sc.save()
    self.soa_base = ScrapedObjAttr(name='base', attr_type='B', obj_class=self.sc)
    self.soa_base.save()
    self.soa_title = ScrapedObjAttr(name='title', attr_type='S', obj_class=self.sc)
    self.soa_title.save()
    self.soa_url = ScrapedObjAttr(name='url', attr_type='U', obj_class=self.sc, id_field=True)
    self.soa_url.save()
    self.soa_url2 = ScrapedObjAttr(name='url2', attr_type='U', obj_class=self.sc)
    self.soa_url2.save()
    self.soa_desc = ScrapedObjAttr(name='description', attr_type='S', obj_class=self.sc)
    self.soa_desc.save()
    self.soa_desc2 = ScrapedObjAttr(name='description2', attr_type='S', obj_class=self.sc)
    self.soa_desc2.save()
    self.soa_es_1 = ScrapedObjAttr(name='extra_standard_1', attr_type='S',
                                   obj_class=self.sc, save_to_db=False)
    self.soa_es_1.save()

    # Active scraper; all elements live on the main page ('MP') except the
    # optional description which comes from detail page 'DP1'.
    self.scraper = Scraper(name='Event Scraper', scraped_obj_class=self.sc, status='A',)
    self.scraper.save()
    self.se_base = ScraperElem(scraped_obj_attr=self.soa_base, scraper=self.scraper,
                               x_path='//ul/li', request_page_type='MP')
    self.se_base.save()
    self.se_title = ScraperElem(scraped_obj_attr=self.soa_title, scraper=self.scraper,
                                x_path='a/text()', request_page_type='MP')
    self.se_title.save()
    self.se_url = ScraperElem(scraped_obj_attr=self.soa_url, scraper=self.scraper,
                              x_path='a/@href', request_page_type='MP')
    self.se_url.save()
    self.se_desc = ScraperElem(scraped_obj_attr=self.soa_desc, scraper=self.scraper,
                               x_path='//div/div[@class="description"]/text()',
                               request_page_type='DP1', mandatory=False)
    self.se_desc.save()
    self.se_es_1 = ScraperElem(scraped_obj_attr=self.soa_es_1, scraper=self.scraper,
                               x_path='a/text()', request_page_type='MP')
    self.se_es_1.save()

    # Page types: the main page and one detail page reached via 'url'.
    self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper)
    self.rpt_mp.save()
    self.rpt_dp1 = RequestPageType(page_type='DP1', scraper=self.scraper,
                                   scraped_obj_attr=self.soa_url)
    self.rpt_dp1.save()

    # Scheduler runtime and the website object the spider crawls.
    self.sched_rt = SchedulerRuntime()
    self.sched_rt.save()
    self.event_website = EventWebsite(pk=1, name='Event Website', scraper=self.scraper,
                                      url=os.path.join(self.SERVER_URL, 'site_generic/event_main.html'),
                                      scraper_runtime=self.sched_rt,)
    self.event_website.save()

    # Hook every public Scrapy signal so tests can assert on what fired.
    for sig_name, sig in list(vars(signals).items()):
        if not sig_name.startswith('_'):
            dispatcher.connect(self.record_signal, sig)