def fetch_votes_session(self, session, list_url, remote_session_id):
    """Fetch and store every House of Commons vote for one session."""
    # Record the EN votes-listing URL, then derive the FR counterpart from
    # the language toggle on the EN page.
    en_url = url_tweak(list_url, update={"sessionId": remote_session_id})
    session.links[EN][sources.NAME_HOC_VOTES[EN]] = en_url
    session.links[FR][sources.NAME_HOC_VOTES[FR]] = get_french_parl_url(
        en_url,
        BeautifulSoup(fetch_url(en_url), "lxml"),
    )
    session.save()

    export_url = url_tweak(
        "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/ExportVotes?output=XML",
        update={"sessionId": remote_session_id},
    )
    # Parliaments before the 42nd are finished, so their XML may be cached.
    parl_soup = BeautifulSoup(
        fetch_url(export_url, use_cache=session.parliament.number < 42),
        "lxml",
    )
    # Oddly named considering the previous format we found this in
    for overview in tqdm(
        parl_soup.find_all("voteparticipant"),
        desc=str(session),
        unit="vote",
    ):
        self.fetch_vote(overview, session)
def fetch_general_election(self, parliament):
    """Create and save the GeneralElection record for the given parliament."""
    logger.debug("Fetching general election, {}".format(parliament))
    url = "https://lop.parl.ca/About/Parliament/FederalRidingsHistory/hfer.asp?Search=Gres&genElection={}".format(
        parliament.number)
    election_data = self.general_election_data[parliament.number]
    election_year = election_data["date"].year
    # Per-language LoP result pages plus the Wikipedia article for the year.
    links = {
        EN: {
            sources.NAME_LOP_GENERAL_ELECTION[EN]:
                url_tweak(url, update={"Language": sources.LANG_LOP[EN]}),
            sources.NAME_WIKI[EN]:
                "https://en.wikipedia.org/wiki/Canadian_federal_election,_{}"
                .format(election_year),
        },
        FR: {
            sources.NAME_LOP_GENERAL_ELECTION[FR]:
                url_tweak(url, update={"Language": sources.LANG_LOP[FR]}),
            sources.NAME_WIKI[FR]:
                "https://fr.wikipedia.org/wiki/Élections_fédérales_canadiennes_de_{}"
                .format(election_year),
        },
    }
    election = models.GeneralElection(
        number=parliament.number,
        parliament=parliament,
        links=links,
        **election_data,
    )
    election.save()
def fetch_hoc_committees_session(self, session, session_url):
    """Fetch each House of Commons committee listed for one session."""
    committee_links = BeautifulSoup(
        fetch_url(session_url),
        "html.parser",
    ).select(".committees-list .accordion-content a")
    for link in tqdm(committee_links, desc=str(session), unit="committee"):
        committee_url = {
            EN: url_tweak(urljoin(session_url, link.attrs["href"]))
        }
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_HOC,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]), "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = (
                soup.select(".institution-brand")[0].text)
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = (
                soup.select(".header-title.current-committee-profile")[0].text)
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = (
                committee_url[lang])
            if not committee.slug:
                # First (EN) pass only: joint committees are recognized by
                # their English name before a slug exists.
                if "Joint" in committee.names[lang][
                        sources.NAME_PARL_COMMITTEE[lang]]:
                    committee.chamber = models.Committee.CHAMBER_JOINT
                committee.slug = self.get_slug(committee)
                # The FR URL comes from the language toggle on the EN page.
                committee_url[FR] = get_french_parl_url(
                    committee_url[lang], soup)
        committee.save()
def augment_election_wiki(self, election):
    """Parse {{Infobox election}} from the Wikipedia edit page into the election.

    Stores the parsed structure on ``election.wiki_info_box``: per-party
    fields (keys ending in a number, e.g. ``leader3``) are grouped into
    ``infobox["parties"]``; everything else becomes a scalar entry.
    """
    soup = BeautifulSoup(fetch_url(url_tweak(
        election.links[EN][sources.NAME_WIKI[EN]],
        update={"action": "edit"},
    )), "html.parser")

    # Get the info box from the raw wikitext inside the edit textarea.
    page_source = soup.select("#wpTextbox1")[0].text
    infobox_lines = re.search("{{Infobox election\n(.*?)\n}}", page_source,
                              re.S | re.I).groups()[0].splitlines()
    infobox = {}
    infobox["parties"] = []
    for key, value in [
            line[2:].split("=", 1) for line in infobox_lines
            if line.startswith("| ")
    ]:
        key = key.strip()
        value = value.strip()
        # Match the full trailing number: the previous code used
        # int(key[-1]), so keys numbered 10+ (e.g. "party10") read only
        # the final digit and mis-filed into the wrong (or last) bucket.
        numbered = re.search(r"^(.*?)(\d+)$", key)
        if numbered:
            party_place = int(numbered.group(2)) - 1
            while len(infobox["parties"]) <= party_place:
                infobox["parties"].append({})
            infobox["parties"][party_place][numbered.group(1)] = value
        else:
            infobox[key] = value
    election.wiki_info_box = infobox
    election.save()
def augment_ridings_ec(self):
    """Map ridings to Elections Canada district numbers, then augment each."""
    listing_soup = BeautifulSoup(
        fetch_url(url_tweak(
            "http://www.elections.ca/Scripts/vis/SearchProvinces?PROV=CA&PROVID=99999&QID=-1&PAGEID=20",
            update={"L": sources.LANG_EC[EN]},
        )),
        "html.parser",
    )
    for row in tqdm(listing_soup.select("table tr")):
        cells = row.find_all("td", recursive=False)
        if not cells:
            continue  # rows without direct <td> children carry no data
        # NOTE(review): slug is built from cells[1] then cells[0] —
        # presumably province then riding name; confirm against EC markup.
        riding = models.Riding.objects.get(slug=slugify("{} {}".format(
            cells[1].text,
            cells[0].text,
        )))
        riding.electoral_district_number = parse_qs(
            urlparse(cells[0].a.attrs["href"]).query)["ED"][0]
        self.cached_ridings[riding.electoral_district_number] = riding
        riding.save()
    for riding in tqdm(
            models.Riding.objects.filter(
                electoral_district_number__isnull=False),
            desc="Augment Ridings, Elections Canada",
            unit="riding",
    ):
        self.augment_riding_ec(riding)
def fetch_provinces(self):
    """Discover provinces from the LoP list pages (EN pass, then FR pass)."""
    url = url_tweak(self.ROOT_URL, update={"Language": sources.LANG_LOP[EN]})
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser",
            ).select("#ctl00_pnlContent a"),
            desc="Fetch Provinces, LoP (EN)",
            unit="province",
    ):
        if not link.attrs.get("id", "").startswith(
                "ctl00_cphContent_repProvinces_"):
            continue
        province, created = models.Province.objects.get_or_create(
            slug=slugify(link.text.strip()))
        url_en = url_tweak(
            urljoin(url, link.attrs["href"]),
            remove=("MenuID", "MenuQuery"),
            update={"Section": "All"},
        )
        self.augment_province(province, EN, url_en)

    # NOTE(review): the EN pass sets Language=sources.LANG_LOP[EN] but this
    # pass sets Language=FR directly — confirm FR equals sources.LANG_LOP[FR].
    url = url_tweak(self.ROOT_URL, update={"Language": FR})
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser",
            ).select("#ctl00_pnlContent a"),
            desc="Fetch Provinces, LoP (FR)",
            unit="province",
    ):
        if not link.attrs.get("id", "").startswith(
                "ctl00_cphContent_repProvinces_"):
            continue
        url_fr = url_tweak(
            urljoin(url, link.attrs["href"]),
            remove=("MenuID", "MenuQuery"),
            update={"Section": "All"},
        )
        # Find the province saved during the EN pass by matching the EN
        # variant of this FR URL against the stored links.
        province = models.Province.objects.get(
            links__contains=url_tweak(
                url_fr,
                update={"Language": sources.LANG_LOP[EN]},
            ))
        self.augment_province(province, FR, url_fr)
def handle(self, *args, **options):
    """Augment parties with names, links, and item codes from the LoP list.

    Management-command entry point; walks the EN and FR list pages, matching
    rows to existing Party records by LoP item code, then by mapped name.
    """
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    list_url = "https://lop.parl.ca/parlinfo/Lists/Party.aspx"
    for lang in (EN, FR):
        for a in tqdm(
                BeautifulSoup(
                    fetch_url(
                        url_tweak(
                            list_url,
                            update={"Language": sources.LANG_LOP[lang]})),
                    "html.parser").select("td > a"),
                desc="Augment Parties, LoP",
                unit="party",
        ):
            if "_lnkParty_" not in a.attrs.get("id", ""):
                continue
            url = url_tweak(
                urljoin(list_url, a.attrs["href"]),
                update={"Section": "ALL"},
                remove=("MenuID", "MenuQuery"),
            )
            lop_item_code = sources.LOP_CODE.search(url).group().lower()
            party = models.Party.objects.filter(
                lop_item_code=lop_item_code).first()
            if not party:
                name = sources.WHITESPACE.sub(" ", a.text.strip())
                name = LOP_LIST_MAPPING.get(name, name)
                if name is None:
                    continue  # explicitly mapped to None: skip this entry
                party = get_cached_obj(cached_parties, name)
            party.links[lang][sources.NAME_LOP_PARTY[lang]] = url
            party.names[lang][
                sources.NAME_LOP_PARTY[lang]] = a.text.strip()
            # Reuse the code computed above (previously the URL was
            # regex-searched a second time for the same value).
            party.lop_item_code = lop_item_code
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            for link in soup.select("#ctl00_cphContent_dataLinks a"):
                party.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                    "", link.text.strip())] = link.attrs["href"]
            party.save()
def fetch_riding(self, riding, url):
    """Store EN/FR LoP riding-history links and pre-warm the URL cache."""
    for lang in (EN, FR):
        lang_url = url_tweak(url, update={"Language": sources.LANG_LOP[lang]})
        riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = lang_url
        try:
            # Best effort: fetch now so later augmentation hits the cache.
            fetch_url(lang_url)
        except Exception as e:
            logger.exception(e)
    riding.save()
    self.cached_ridings[riding.slug] = riding
def augment_riding(self, riding):
    """Name the riding from its LoP history pages and link related ridings."""
    try:
        # FR first, so after the loop `url` and `soup` hold the EN page,
        # which the related-riding scan below reads.
        for lang in (FR, EN):
            url = riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = (
                soup.select("h4")[0].text.split(", ")[0])
    except (KeyError, FetchFailure, FetchSuppressed) as e:
        logger.exception(e)
        return
    riding.save()
    # "#previous" / "#became" anchor lists of predecessor/successor ridings.
    for tag_id in ("#previous", "#became"):
        anchors = soup.select(tag_id)
        if not anchors:
            continue
        for link in anchors[0].parent.select("a"):
            match = re.search(
                r"^(?P<name>.*) \((?P<province>.*)\)\((?P<daterange>.*)\)",
                link.text).groupdict()
            riding_slug = slugify("{province}-{name}".format(**match))
            try:
                related_riding = get_cached_obj(
                    self.cached_ridings, riding_slug)
            except AssertionError:
                # Unseen riding: create it and pull its own LoP pages.
                province = get_cached_obj(self.cached_provinces,
                                          match["province"])
                related_riding, created = models.Riding.objects.get_or_create(
                    slug=riding_slug, province=province)
                logger.debug("Auxilliary riding detected: {}".format(
                    riding_slug))
                for lang in (EN, FR):
                    if sources.NAME_LOP_RIDING_HISTORY[
                            lang] not in related_riding.links[lang]:
                        related_riding.links[lang][
                            sources.NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
                                urljoin(url, link.attrs["href"]),
                                update={
                                    "Language": sources.LANG_LOP[lang]
                                },
                            )
                        related_riding.names[lang][
                            sources.NAME_LOP_RIDING_HISTORY[
                                lang]] = BeautifulSoup(
                                    fetch_url(related_riding.links[lang][
                                        sources.NAME_LOP_RIDING_HISTORY[lang]]
                                    ),
                                    "html.parser",
                                ).select("h4")[0].text.split(", ")[0]
                related_riding.save()
            riding.related_historically.add(related_riding)
def fetch_by_elections(self, parliament):
    """Create a ByElection record per distinct by-election date in the parliament."""
    logger.debug("Fetching by-elections, {}".format(parliament))
    url = "https://lop.parl.ca/About/Parliament/FederalRidingsHistory/hfer.asp?Language=E&Search=Bres&genElection={}".format(
        parliament.number)
    # Caching for later use
    soup = BeautifulSoup(fetch_url(url), "html.parser")
    dates = {
        dateparse(
            LOP_ROW_RIDING.search(row.text.strip()).groupdict()["date"])
        for row in soup.select(".rid")
    }
    for date in dates:
        date_param = date.strftime("%Y/%m/%d")
        models.ByElection.objects.get_or_create(
            slug=f"{parliament.number}-{date}",
            parliament=parliament,
            date=date,
            links={
                EN: {
                    sources.NAME_LOP_BY_ELECTION[EN]:
                        url_tweak(
                            url,
                            remove=("genElection", ),
                            update={"byElection": date_param}),
                },
                FR: {
                    sources.NAME_LOP_BY_ELECTION[FR]:
                        url_tweak(
                            url,
                            remove=("genElection", ),
                            update={
                                "byElection": date_param,
                                "Language": sources.LANG_LOP[FR],
                            }),
                },
            },
        )
def fetch_parliaments(self):
    """Fetch parliament records from the LoP list with wiki/LoP/Canadiana links.

    Details are only (re)fetched for newly created records or for the 42nd
    parliament onward; earlier parliaments are treated as immutable.
    """
    list_url = "https://lop.parl.ca/parlinfo/Lists/Parliament.aspx"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select("#ctl00_cphContent_ctl00_grdParliamentList td > a"),
            desc="Fetch Parliaments, LoP",
            unit="parliament",
    ):
        parliament, created = models.Parliament.objects.get_or_create(
            number=int(REVERSE_ORDINAL.sub(r"\1", link.text)),
        )
        if created or parliament.number >= 42:
            # Bug fix: this previously reassigned `url` (the list URL), so
            # every later iteration resolved its relative href against the
            # previous parliament's tweaked URL instead of the list page.
            parl_url = url_tweak(
                urljoin(list_url, link.attrs["href"]),
                remove=("MenuID", "MenuQuery"),
                update={"Section": "All"},
            )
            parliament.links = {
                EN: {sources.NAME_WIKI[EN]: "https://en.wikipedia.org/wiki/{}_Canadian_Parliament".format(inflector.ordinal(parliament.number))},
                FR: {sources.NAME_WIKI[FR]: "https://fr.wikipedia.org/wiki/{}{}_législature_du_Canada".format(parliament.number, "re" if parliament.number == 1 else "e")},
            }
            for lang in (EN, FR):
                parliament.links[lang][
                    sources.NAME_LOP_PARLIAMENT[lang]] = url_tweak(
                        parl_url, update={"Language": sources.LANG_LOP[lang]})
                if parliament.number <= 35:
                    # Canadiana's digitized record covers parliaments 1-35.
                    parliament.links[lang][sources.NAME_CANADIANA[lang]] = "http://parl.canadiana.ca/search?usrlang={}&lang={}&identifier=P{}".format(
                        sources.LANG_CANADIANA_UI[lang],
                        sources.LANG_CANADIANA_CONTENT[lang],
                        parliament.number,
                    )
            # Seat count comes from the last column of the first row of the
            # party-standings grid on the EN parliament page.
            parliament.seats = int(BeautifulSoup(
                fetch_url(parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]),
                "html.parser",
            ).select("#ctl00_cphContent_ctl06_pnlSectionPartyStandingsContent .GridRows")[0].contents[-1].text)
            parliament.save()
def fetch_hoc_committees(self):
    """Walk the HoC committee list's session selectors and fetch each session."""
    list_url = "http://www.ourcommons.ca/Committees/en/List"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select(".session-selector"),
            desc="Fetch Committees, HoC",
            unit="session",
    ):
        # Parliament and session numbers live in the link's query string.
        query = parse_qs(urlparse(link.attrs["href"]).query)
        session = Session.objects.get(
            parliament__number=query["parl"][0],
            number=query["session"][0],
        )
        self.fetch_hoc_committees_session(
            session,
            url_tweak(urljoin(list_url, link.attrs["href"])),
        )
def fetch_senate_committees(self):
    """Walk the Senate committee list's session links and fetch each session."""
    list_url = "https://sencanada.ca/en/committees/"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select(".session-dropdown-session a"),
            desc="Fetch Committees, Senate",
            unit="session",
    ):
        # hrefs end in ".../{parliament}-{session}/".
        parliament_number, session_number = link.attrs["href"].strip(
            "/").rsplit("/", 1)[1].split("-")
        session = Session.objects.get(
            parliament__number=parliament_number,
            number=session_number,
        )
        self.fetch_senate_committees_session(
            session,
            url_tweak(urljoin(list_url, link.attrs["href"])),
        )
def fetch_parliamentarian(self, slug, name, lang_naive_url):
    """Create a parliamentarian from their LoP profile (EN and FR) with photo.

    No-op when a record with this slug already exists.
    """
    parliamentarian, created = models.Parliamentarian.objects.get_or_create(
        slug=slug)
    if not created:
        return
    for lang in (EN, FR):
        parliamentarian.names[lang][
            sources.NAME_LOP_PARLIAMENT[lang]] = name
        url = url_tweak(lang_naive_url,
                        update={"Language": sources.LANG_LOP[lang]})
        parliamentarian.links[lang][
            sources.NAME_LOP_PARLIAMENTARIAN[lang]] = url
        soup = BeautifulSoup(fetch_url(url), "html.parser")
        parliamentarian.names[lang][sources.NAME_LOP_PARLIAMENTARIAN[
            lang]] = sources.WHITESPACE.sub(
                " ", soup.select("#ctl00_cphContent_lblTitle")[0].text)
        for link in soup.select("#ctl00_cphContent_dataLinks a"):
            parliamentarian.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                "", link.text.strip())] = link.attrs["href"]
        try:
            parliamentarian.lop_item_code = sources.LOP_CODE.search(
                url).group().lower()
            parliamentarian.birthdate = soup.select(
                "#ctl00_cphContent_DateOfBirthData")[0].text.strip().replace(
                    ".", "-")
        except Exception as e:
            # Best effort — some profiles have no code or birthdate. The
            # previous bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception and log the miss.
            logger.debug(e)
    # Download the parliamentarian's photo if they have one
    photo_url = urljoin(
        url,
        soup.select("#ctl00_cphContent_imgParliamentarianPicture")
        [0].attrs["src"])
    code = sources.LOP_CODE.search(photo_url).group().lower()
    # The all-zero GUID is the LoP placeholder for "no photo".
    if code != "00000000-0000-0000-0000-000000000000":
        filename = "{}.jpg".format(code)
        filepath = parliamentarian.photo.field.upload_to(None, filename)
        if os.path.exists(os.path.join(settings.MEDIA_ROOT, filepath)):
            # Already downloaded on a previous run; just point at it.
            parliamentarian.photo = filepath
        else:
            parliamentarian.photo.save(
                filename, ContentFile(requests.get(photo_url).content))
    parliamentarian.save()
def fetch_bills_session(self, session):
    """Fetch all bills for a session from the LEGISinfo XML export."""
    cached_committees = get_cached_dict(
        models.Committee.objects.filter(session=session))
    url = "http://www.parl.ca/LegisInfo/Home.aspx?download=xml&ParliamentSession={}-{}".format(
        session.parliament.number, session.number)
    # Completed parliaments (< 42) are static, so their XML may be cached.
    soup = BeautifulSoup(
        fetch_url(url, use_cache=session.parliament.number < 42), "lxml")
    for bill_soup in tqdm(
            soup.find_all("bill"),
            desc=str(session),
            unit="bill",
    ):
        number_tag = bill_soup.select("billnumber")[0]
        # e.g. prefix "C", number "38", optional suffix -> "C-38".
        bill_number = "-".join(filter(None, (
            number_tag.attrs["prefix"],
            number_tag.attrs["number"],
            number_tag.get("suffix", None),
        )))
        bill = models.Bill(
            session=session,
            slug=slugify("{}-{}".format(session.slug, bill_number)),
        )
        for lang in (EN, FR):
            bill.links[lang][sources.NAME_LEGISINFO[lang]] = url_tweak(
                "http://www.parl.gc.ca/LegisInfo/BillDetails.aspx",
                update={
                    "billId": bill_soup.attrs["id"],
                    "Language": sources.LANG_LEGISINFO_UI[lang],
                },
            )
            bill.names[lang][sources.NAME_LEGISINFO_NUMBER[lang]] = bill_number
            bill.names[lang][sources.NAME_LEGISINFO_TITLE[lang]] = bill_soup.select(
                "billtitle > title[language={}]".format(
                    sources.LANG_LEGISINFO_XML[lang]))[0].text
            title_short = bill_soup.select(
                "shorttitle > title[language={}]".format(
                    sources.LANG_LEGISINFO_XML[lang]))[0].text
            if title_short:
                bill.names[lang][
                    sources.NAME_LEGISINFO_TITLE_SHORT[lang]] = title_short
        bill.save()
        # The previous code looped over every <event> but never used the
        # event: each pass re-selected the bill's first committee and
        # re-added it (M2M add is idempotent), so one guarded pass is
        # equivalent. NOTE(review): per-event committees may have been the
        # original intent — confirm against the LEGISinfo XML schema.
        if bill_soup.select("event"):
            try:
                committee_soup = bill_soup.select("committee[accronym]")[0]  # They misspelled "acronym" in their XML
                code = committee_soup.attrs["accronym"]
                if code != "WHOL":
                    bill.committees.add(
                        get_cached_obj(cached_committees, code))
            except IndexError:
                pass  # no committee recorded for this bill
def parse_session(self, session):
    """Parse every sitting linked from the session's publication calendar."""
    session_url = url_tweak(
        "http://www.ourcommons.ca/DocumentViewer/en/SessionPublicationCalendarsWidget?organization=HOC&publicationTypeId=37",
        update={
            "parliament": session.parliament.number,
            "session": session.number,
        },
    )
    # Completed parliaments (< 42) no longer change; cache their calendars.
    calendar_soup = BeautifulSoup(
        fetch_url(
            session_url,
            use_cache=session.parliament.number < 42,
        ),
        "html.parser")
    for sitting_link in tqdm(
            calendar_soup.select("td a"),
            desc=str(session),
            unit="sitting",
    ):
        self.parse_sitting_url(
            urljoin(session_url, sitting_link.attrs["href"]), session)
def fetch_parliamentarians(self, parliament):
    """Index parliamentarian detail URLs by slugified name for later fetching."""
    logger.debug("Fetch parliamentarians, {}".format(parliament))
    url = parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser").select("a[href^=Parliamentarian]"),
            desc=str(parliament),
            unit="parliamentarian",
    ):
        detail_url = url_tweak(
            urljoin(url, link.attrs["href"]),
            update={
                "MoreInfo": "True",
                "Section": "All",
            },
        )
        # We slugify the parliamentarian's name to disambiguate
        # names like "Marcel Masse" and "Marcel Massé"
        self.cache_parliamentarians[slugify(link.text)][detail_url] = link.text
def handle(self, *args, **options):
    """Augment parties with Elections Canada names and links (EN and FR).

    Management-command entry point; parses the EC registered-parties page
    in each language and records the EC long/short names plus anchor links.
    """
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    # Bug fix: the query string had been mangled — "&section" was swallowed
    # into the "§" (&sect;) entity, yielding "…index§ion=pol".
    url = "http://www.elections.ca/content.aspx?dir=par&document=index&section=pol"
    for lang in (EN, FR):
        url_lang = url_tweak(url, update={"lang": sources.LANG_EC[lang]})
        ec_soup = BeautifulSoup(fetch_url(url_lang), "html.parser")
        for h3 in ec_soup.select("h3.partytitle"):
            name = h3.text.strip()
            name_short = h3.attrs["id"]
            name = EC_MAPPING.get(name, name)
            try:
                party = get_cached_obj(cached_parties, name)
            except AssertionError:
                # Full name unknown: fall back to the short code.
                party = get_cached_obj(cached_parties, name_short)
            party.names[lang][sources.NAME_EC[lang]] = name
            party.names[lang][sources.NAME_EC_SHORT[lang]] = name_short
            party.links[lang][sources.NAME_EC[lang]] = "{}#{}".format(
                url_lang, name_short)
            party.save()
            # Make both spellings resolve to this party next time around.
            cached_parties[name].add(party)
            cached_parties[name_short].add(party)
def fetch_senate_committees_session(self, session, session_url):
    """Fetch each Senate committee listed for one session."""
    committee_links = BeautifulSoup(
        fetch_url(session_url),
        "html.parser").select(".committee-list-boxes-wrapper a")
    for link in tqdm(committee_links, desc=str(session), unit="committee"):
        committee_url = {
            EN: url_tweak(urljoin(session_url, link.attrs["href"]))
        }
        if link.select(".joint-committee-list-boxes"):
            logger.debug(
                "Skipping {} (broken, reported, joint committees are covered in HoC anyway)"
                .format(committee_url[EN]))
            continue
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_SEN,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]),
                                 "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = (
                soup.select("meta[name=dc.description]")[0].attrs["content"])
            # The committee code is the second-to-last path segment.
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = (
                committee_url[lang].strip("/").split("/")[-2].upper())
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = (
                committee_url[lang])
            if not committee.slug:
                # First (EN) pass only: assign the slug and discover the FR
                # URL via the language toggle on the EN page.
                committee.slug = self.get_slug(committee)
                committee_url[FR] = get_french_parl_url(
                    committee_url[lang], soup)
        committee.save()
def augment_parties_by_parliament_file(self, parliament, cached_parties):
    """Reconcile party affiliations between HFER and the LoP parliament file.

    https://lop.parl.ca/About/Parliament/FederalRidingsHistory/hfer.asp?Language=E&Search=C says
    "Some discrepancies in data may appear. Data appearing in the Federal
    Member Profile (biography) should be considered the authoritative
    source." So we might need to change the party noted from HFER to that
    detected through the Parliament file.
    """
    for lang in (EN, FR):
        url = parliament.links[lang][sources.NAME_LOP_PARLIAMENT[lang]]
        soup = BeautifulSoup(fetch_url(url), "html.parser")
        for row in tqdm(
                soup.select("#ctl00_cphContent_ctl04_repGeneralElection_ctl00_grdMembers tr"),
                desc=str(parliament),
                unit="party",
        ):
            cells = row.find_all("td", recursive=False)
            if not cells:
                continue
            if (parliament.number, cells[0].a.text) in BYPASSED:
                continue
            parliamentarian_name = sources.WHITESPACE.sub(
                " ", cells[0].a.text.strip())
            party_name = sources.WHITESPACE.sub(" ", cells[2].text.strip())
            # Normalize the 13th parliament's Unionist labels to their
            # fuller historical names.
            if parliament.number == 13:
                if party_name == "Unionist (Liberal)":
                    party_name = "Unionist (Conservative and Liberal)"
                elif party_name == "Union (libéral)":
                    party_name = "Union (conservateurs et libéraux)"
            if party_name.startswith(("Independent", "Indépendant")):
                continue
            lop_item_code = sources.LOP_CODE.search(
                cells[0].a.attrs["href"]).group().lower()
            election_candidates = ElectionCandidate.objects.filter(
                election_riding__general_election=parliament.general_election,
                parliamentarian__lop_item_code=lop_item_code,
            )
            party = election_candidates.first().party
            if not party:
                # HFER recorded no party; the parliament file may name one
                # we already know.
                if party_name in cached_parties:
                    logger.debug("{}, {}, shows HFER as an independent, but now shows up as {}".format(
                        parliament,
                        parliamentarian_name,
                        party_name,
                    ))
                    election_candidates.update(
                        party=cached_parties[party_name])
            elif (party.slug, party_name) in NEGATIVE:
                # Known discrepancy; log on the EN pass only and move on.
                if lang == EN:
                    logger.debug("{}, {}, shows in HFER as {}, but PFile as {}".format(
                        parliament,
                        parliamentarian_name,
                        party.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]],
                        party_name,
                    ))
            elif sources.NAME_LOP_PARLIAMENT[lang] not in party.names[lang]:
                # First sighting of this party in a parliament file: record
                # its name and link, and remember the name mapping.
                party.names[lang][sources.NAME_LOP_PARLIAMENT[lang]] = party_name
                if cells[2].a.attrs.get("href", None):
                    party.links[lang][sources.NAME_LOP_PARTY[lang]] = url_tweak(
                        urljoin(url, cells[2].a.attrs["href"]),
                        update={"Section": "All"},
                    )
                party.save()
                logger.debug("{}, mapping {} to {} via {}".format(
                    parliament, party.slug, party_name, parliamentarian_name))
                cached_parties[party_name] = party
            elif party.names[lang][sources.NAME_LOP_PARLIAMENT[lang]] != party_name:
                # Parliament file disagrees with what we knew; trust it.
                logger.debug("{}, {}, shows HFER as {}, known previous in PFile as {}, but now shows up as {}".format(
                    parliament,
                    parliamentarian_name,
                    party.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]],
                    party.names[lang][sources.NAME_LOP_PARLIAMENT[lang]],
                    party_name,
                ))
                election_candidates.update(party=party)
        # NOTE(review): placed inside the language loop (it reads `soup`);
        # confirm the original indentation agreed.
        government_party_name = sources.WHITESPACE.sub(
            " ",
            soup.select("#ctl00_cphContent_GoverningPartyData")[0].text.strip())
        parliament.government_party = cached_parties[government_party_name]
        parliament.save()