def setUp(self):
        """Build the crawler and database fixtures used by the tests.

        Steps, in order:

        1. Recreate ``self.IMG_DIR`` as an empty directory.
        2. Copy entries of ``self.dds_settings`` into the module-level
           ``settings`` object at ``'cmdline'`` priority and set
           ``COOKIES_DEBUG`` to ``True``.
        3. Create and configure a ``Crawler`` whose ``spider_closed`` signal
           is connected to ``reactor.stop``.
        4. Save an ``Event`` object class, four attributes, a scraper with
           status ``'A'``, one scraper element per attribute, a scheduler
           runtime and an ``EventWebsite`` (pk=1) referencing them.
        5. Connect ``self.record_signal`` to every public name found in
           ``signals``.
        """
        # Local import so this method does not depend on a file-level import.
        import posixpath

        # Start from an empty image directory.
        if os.path.exists(self.IMG_DIR):
            shutil.rmtree(self.IMG_DIR)
        os.mkdir(self.IMG_DIR)

        # Keys that are always read from dds_settings (a missing key fails
        # the indexing below, exactly as the individual lookups did).
        required_keys = (
            'ITEM_PIPELINES',
            'SPLASH_URL',
            'DUPEFILTER_CLASS',
            'DOWNLOADER_MIDDLEWARES',
            'IMAGES_STORE',
        )
        # Keys that are forwarded only when dds_settings provides them.
        optional_keys = (
            'IMAGES_THUMBS',
            'DSCRAPER_IMAGES_STORE_FORMAT',
        )
        for key in required_keys:
            settings.set(key, self.dds_settings[key], priority='cmdline')
        for key in optional_keys:
            if key in self.dds_settings:
                settings.set(key, self.dds_settings[key], priority='cmdline')

        settings.set('COOKIES_DEBUG', True)

        self.crawler = Crawler(settings)
        # Stop the reactor once the spider has closed.
        self.crawler.signals.connect(reactor.stop,
                                     signal=signals.spider_closed)
        self.crawler.configure()

        # Scraped object class and its attributes.
        self.sc = ScrapedObjClass(name='Event')
        self.sc.save()

        self.soa_base = ScrapedObjAttr(
            name=u'base', attr_type='B', obj_class=self.sc)
        self.soa_base.save()
        self.soa_title = ScrapedObjAttr(
            name=u'title', attr_type='S', obj_class=self.sc)
        self.soa_title.save()
        # The url attribute is the one flagged as id_field.
        self.soa_url = ScrapedObjAttr(
            name=u'url', attr_type='U', obj_class=self.sc, id_field=True)
        self.soa_url.save()
        self.soa_desc = ScrapedObjAttr(
            name=u'description', attr_type='S', obj_class=self.sc)
        self.soa_desc.save()

        self.scraper = Scraper(
            name=u'Event Scraper',
            scraped_obj_class=self.sc,
            status='A',
        )
        self.scraper.save()

        # One scraper element (XPath) per attribute; only the description
        # is read from the detail page and it is not mandatory.
        self.se_base = ScraperElem(
            scraped_obj_attr=self.soa_base,
            scraper=self.scraper,
            x_path=u'//ul/li',
            from_detail_page=False)
        self.se_base.save()
        self.se_title = ScraperElem(
            scraped_obj_attr=self.soa_title,
            scraper=self.scraper,
            x_path=u'a/text()',
            from_detail_page=False)
        self.se_title.save()
        self.se_url = ScraperElem(
            scraped_obj_attr=self.soa_url,
            scraper=self.scraper,
            x_path=u'a/@href',
            from_detail_page=False)
        self.se_url.save()
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            # URLs always use '/', so join with posixpath instead of the
            # platform-dependent os.path (same result on POSIX systems).
            url=posixpath.join(self.SERVER_URL,
                               'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # Route every public signal to record_signal.
        for name, sig in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, sig)
# Example #2
# 0
    def setUp(self):
        """Build the database fixtures and the crawler used by the tests.

        Saves an ``Event`` object class with four attributes, a scraper with
        status ``'A'``, one scraper element per attribute, a scheduler
        runtime and an ``EventWebsite`` (pk=1).  Then overrides the item
        pipelines and image settings on the module-level ``settings``
        object, creates, installs and configures a ``CrawlerProcess``, and
        connects ``self.record_signal`` to every public name in ``signals``.
        """
        # Local import so this method does not depend on a file-level import.
        import posixpath

        # Scraped object class and its attributes.
        self.sc = ScrapedObjClass(name='Event')
        self.sc.save()

        self.soa_base = ScrapedObjAttr(
            name=u'base', attr_type='B', obj_class=self.sc)
        self.soa_base.save()
        self.soa_title = ScrapedObjAttr(
            name=u'title', attr_type='S', obj_class=self.sc)
        self.soa_title.save()
        self.soa_url = ScrapedObjAttr(
            name=u'url', attr_type='U', obj_class=self.sc)
        self.soa_url.save()
        self.soa_desc = ScrapedObjAttr(
            name=u'description', attr_type='S', obj_class=self.sc)
        self.soa_desc.save()

        self.scraper = Scraper(
            name=u'Event Scraper',
            scraped_obj_class=self.sc,
            status='A',
        )
        self.scraper.save()

        # One scraper element (XPath) per attribute; only the description
        # is read from the detail page and it is not mandatory.
        self.se_base = ScraperElem(
            scraped_obj_attr=self.soa_base,
            scraper=self.scraper,
            x_path=u'//ul/li',
            from_detail_page=False)
        self.se_base.save()
        self.se_title = ScraperElem(
            scraped_obj_attr=self.soa_title,
            scraper=self.scraper,
            x_path=u'a/text()',
            from_detail_page=False)
        self.se_title.save()
        self.se_url = ScraperElem(
            scraped_obj_attr=self.soa_url,
            scraper=self.scraper,
            x_path=u'a/@href',
            from_detail_page=False)
        self.se_url.save()
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path=u'//div/div[@class="description"]/text()',
            from_detail_page=True,
            mandatory=False)
        self.se_desc.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1,
            name=u'Event Website',
            scraper=self.scraper,
            # URLs always use '/', so join with posixpath instead of the
            # platform-dependent os.path (same result on POSIX systems).
            url=posixpath.join(self.SERVER_URL,
                               'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # NOTE(review): settings.overrides and CrawlerProcess.install() look
        # like a legacy Scrapy API - confirm the pinned Scrapy version
        # before modernizing these calls.
        settings.overrides['ITEM_PIPELINES'] = [
            'dynamic_scraper.pipelines.DjangoImagesPipeline',
            'dynamic_scraper.pipelines.ValidationPipeline',
            'scraper.scraper_test.DjangoWriterPipeline',
        ]

        # IMAGES_STORE is a filesystem path, so os.path.join is correct here.
        settings.overrides['IMAGES_STORE'] = os.path.join(
            self.PROJECT_ROOT, 'imgs')
        settings.overrides['IMAGES_THUMBS'] = {
            'small': (170, 170),
        }

        self.crawler = CrawlerProcess(settings)
        self.crawler.install()
        self.crawler.configure()

        # Route every public signal to record_signal.
        for name, sig in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, sig)
    def setUp(self):
        """Build the crawler process and database fixtures used by the tests.

        Steps, in order:

        1. Recreate ``self.IMG_DIR`` as an empty directory.
        2. Copy entries of ``self.dds_settings`` into the module-level
           ``settings`` object at ``'cmdline'`` priority, then set
           ``COOKIES_DEBUG``, ``LOG_LEVEL`` and ``LOG_ENABLED``.
        3. Create ``self.process`` as a ``CrawlerProcess`` over ``settings``.
        4. Save an ``Event`` object class, seven attributes, a scraper with
           status ``'A'``, five scraper elements, two request page types
           (``'MP'`` and ``'DP1'``), a scheduler runtime and an
           ``EventWebsite`` (pk=1) referencing them.
        5. Connect ``self.record_signal`` to every public name found in
           ``signals``.
        """
        # Local import so this method does not depend on a file-level import.
        import posixpath

        # Start from an empty image directory.
        if os.path.exists(self.IMG_DIR):
            shutil.rmtree(self.IMG_DIR)
        os.mkdir(self.IMG_DIR)

        # Keys that are always read from dds_settings (a missing key fails
        # the indexing below, exactly as the individual lookups did).
        required_keys = (
            'ITEM_PIPELINES',
            'SPLASH_URL',
            'DUPEFILTER_CLASS',
            'DOWNLOADER_MIDDLEWARES',
            'IMAGES_STORE',
        )
        # Keys that are forwarded only when dds_settings provides them.
        optional_keys = (
            'IMAGES_THUMBS',
            'DSCRAPER_IMAGES_STORE_FORMAT',
        )
        for key in required_keys:
            settings.set(key, self.dds_settings[key], priority='cmdline')
        for key in optional_keys:
            if key in self.dds_settings:
                settings.set(key, self.dds_settings[key], priority='cmdline')

        settings.set('COOKIES_DEBUG', True)
        settings.set('LOG_LEVEL', 'DEBUG')
        settings.set('LOG_ENABLED', False)

        self.process = CrawlerProcess(settings)

        # Scraped object class and its attributes.
        self.sc = ScrapedObjClass(name='Event')
        self.sc.save()

        self.soa_base = ScrapedObjAttr(
            name='base', attr_type='B', obj_class=self.sc)
        self.soa_base.save()
        self.soa_title = ScrapedObjAttr(
            name='title', attr_type='S', obj_class=self.sc)
        self.soa_title.save()
        # The url attribute is the one flagged as id_field.
        self.soa_url = ScrapedObjAttr(
            name='url', attr_type='U', obj_class=self.sc, id_field=True)
        self.soa_url.save()
        self.soa_url2 = ScrapedObjAttr(
            name='url2', attr_type='U', obj_class=self.sc)
        self.soa_url2.save()
        self.soa_desc = ScrapedObjAttr(
            name='description', attr_type='S', obj_class=self.sc)
        self.soa_desc.save()
        self.soa_desc2 = ScrapedObjAttr(
            name='description2', attr_type='S', obj_class=self.sc)
        self.soa_desc2.save()
        # Extra attribute created with save_to_db=False.
        self.soa_es_1 = ScrapedObjAttr(
            name='extra_standard_1',
            attr_type='S',
            obj_class=self.sc,
            save_to_db=False)
        self.soa_es_1.save()

        self.scraper = Scraper(
            name='Event Scraper',
            scraped_obj_class=self.sc,
            status='A',
        )
        self.scraper.save()

        # Scraper elements (XPaths); only the description uses page type
        # 'DP1' and it is not mandatory, all others use 'MP'.
        self.se_base = ScraperElem(
            scraped_obj_attr=self.soa_base,
            scraper=self.scraper,
            x_path='//ul/li',
            request_page_type='MP')
        self.se_base.save()
        self.se_title = ScraperElem(
            scraped_obj_attr=self.soa_title,
            scraper=self.scraper,
            x_path='a/text()',
            request_page_type='MP')
        self.se_title.save()
        self.se_url = ScraperElem(
            scraped_obj_attr=self.soa_url,
            scraper=self.scraper,
            x_path='a/@href',
            request_page_type='MP')
        self.se_url.save()
        self.se_desc = ScraperElem(
            scraped_obj_attr=self.soa_desc,
            scraper=self.scraper,
            x_path='//div/div[@class="description"]/text()',
            request_page_type='DP1',
            mandatory=False)
        self.se_desc.save()
        self.se_es_1 = ScraperElem(
            scraped_obj_attr=self.soa_es_1,
            scraper=self.scraper,
            x_path='a/text()',
            request_page_type='MP')
        self.se_es_1.save()

        # Request page types used by the elements above; 'DP1' is tied to
        # the url attribute.
        self.rpt_mp = RequestPageType(page_type='MP', scraper=self.scraper)
        self.rpt_mp.save()
        self.rpt_dp1 = RequestPageType(
            page_type='DP1',
            scraper=self.scraper,
            scraped_obj_attr=self.soa_url)
        self.rpt_dp1.save()

        self.sched_rt = SchedulerRuntime()
        self.sched_rt.save()

        self.event_website = EventWebsite(
            pk=1,
            name='Event Website',
            scraper=self.scraper,
            # URLs always use '/', so join with posixpath instead of the
            # platform-dependent os.path (same result on POSIX systems).
            url=posixpath.join(self.SERVER_URL,
                               'site_generic/event_main.html'),
            scraper_runtime=self.sched_rt,
        )
        self.event_website.save()

        # Route every public signal to record_signal.  The namespace is only
        # read inside the loop, so no list() copy is needed.
        for name, sig in vars(signals).items():
            if not name.startswith('_'):
                dispatcher.connect(self.record_signal, sig)