def do_scrape(self, juris, args, scrapers):
    # make output and cache dirs
    utils.makedirs(settings.CACHE_DIR)
    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)
    utils.makedirs(datadir)

    # clear json from data dir
    for f in glob.glob(datadir + '/*.json'):
        os.remove(f)

    report = {}

    # do jurisdiction
    jscraper = JurisdictionScraper(juris, datadir,
                                   strict_validation=args.strict,
                                   fastmode=args.fastmode)
    report['jurisdiction'] = jscraper.do_scrape()

    for scraper_name, scrape_args in scrapers.items():
        ScraperCls = juris.scrapers[scraper_name]
        scraper = ScraperCls(juris, datadir,
                             strict_validation=args.strict,
                             fastmode=args.fastmode)
        report[scraper_name] = scraper.do_scrape(**scrape_args)

    return report
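# A minimal sketch of the JurisdictionScraper.scrape() behavior that
# do_scrape() above relies on and the tests below pin down. This is
# inferred from the tests' assertions, not copied from pupa's
# implementation; the class name here and the add_source call are
# assumptions.
from pupa.scrape import Scraper  # assuming pupa's public scrape API


class SketchJurisdictionScraper(Scraper):
    def scrape(self):
        # the Jurisdiction object itself always comes first
        yield self.jurisdiction
        # then one Organization per entry from get_organizations(),
        # sourced to the jurisdiction's URL
        for org in self.jurisdiction.get_organizations():
            org.add_source(self.jurisdiction.url)
            yield org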
def test_jurisdiction_bicameral_scrape():
    j = FakeJurisdiction()
    js = JurisdictionScraper(j, '/tmp/')
    objects = list(js.scrape())

    obj_names = set()
    obj_types = defaultdict(int)
    for o in objects:
        obj_names.add(o.name)
        obj_types[type(o)] += 1

    # ensure Jurisdiction and 5 organizations were found
    assert obj_names == {'Test', 'Congress', 'House', 'Senate',
                         'Democratic', 'Republican'}
    assert obj_types[FakeJurisdiction] == 1
    assert obj_types[Organization] == 5
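# FakeJurisdiction is a fixture defined elsewhere in the test module; the
# sketch below is a minimal version consistent with the assertions above.
# The names and the organization count come straight from the test;
# jurisdiction_id, url, and the classifications are assumptions.
from pupa.scrape import Jurisdiction, Organization  # assuming pupa's public scrape API


class FakeJurisdiction(Jurisdiction):
    jurisdiction_id = 'test'
    name = 'Test'
    url = 'http://example.com'

    def get_organizations(self):
        yield Organization('Congress', classification='legislature')
        yield Organization('House', classification='lower')
        yield Organization('Senate', classification='upper')
        yield Organization('Democratic', classification='party')
        yield Organization('Republican', classification='party')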
def test_jurisdiction_unicam_scrape():
    class UnicameralJurisdiction(Jurisdiction):
        jurisdiction_id = 'unicam'
        name = 'Unicameral'
        url = 'http://example.com'

        def get_organizations(self):
            yield Organization('Unicameral Legislature',
                               classification='legislature')

    j = UnicameralJurisdiction()
    js = JurisdictionScraper(j, '/tmp/')
    objects = list(js.scrape())

    # two objects, first is the Jurisdiction
    assert len(objects) == 2
    assert objects[0] == j

    # ensure we made a single legislature org as well
    assert isinstance(objects[1], Organization)
    assert objects[1].classification == 'legislature'
    assert objects[1].sources[0]['url'] == j.url