def fetch_hoc_committees_session(self, session, session_url):
    """Fetch House of Commons committees for a session, in both languages."""
    for link in tqdm(
            BeautifulSoup(
                fetch_url(session_url),
                "html.parser",
            ).select(".committees-list .accordion-content a"),
            desc=str(session),
            unit="committee",
    ):
        committee_url = {
            EN: url_tweak(urljoin(session_url, link.attrs["href"]))
        }
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_HOC,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]), "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = soup.select(
                ".institution-brand")[0].text
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = soup.select(
                ".header-title.current-committee-profile")[0].text
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
            if not committee.slug:
                # First (English) pass only: flag joint committees, derive
                # the slug, and pull the French URL from the English page.
                if "Joint" in committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]]:
                    committee.chamber = models.Committee.CHAMBER_JOINT
                committee.slug = self.get_slug(committee)
                committee_url[FR] = get_french_parl_url(committee_url[lang], soup)
        committee.save()
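
# get_french_parl_url is defined elsewhere in this codebase. As a rough
# sketch of the behaviour the methods here rely on, it is assumed to locate
# the language-toggle link on an English parl.ca page and resolve it against
# that page's URL (the lookup below is an illustrative guess, not the actual
# implementation):
#
#   def get_french_parl_url(english_url, soup):
#       toggle = soup.find("a", {"lang": "fr"})  # assumed selector
#       return url_tweak(urljoin(english_url, toggle.attrs["href"]))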
def fetch_votes_session(self, session, list_url, remote_session_id):
    """Fetch all House votes for a session from the ExportVotes XML feed."""
    session.links[EN][sources.NAME_HOC_VOTES[EN]] = url_tweak(
        list_url, update={"sessionId": remote_session_id})
    session.links[FR][sources.NAME_HOC_VOTES[FR]] = get_french_parl_url(
        session.links[EN][sources.NAME_HOC_VOTES[EN]],
        BeautifulSoup(
            fetch_url(session.links[EN][sources.NAME_HOC_VOTES[EN]]),
            "lxml",
        ),
    )
    session.save()
    parl_soup = BeautifulSoup(
        fetch_url(
            url_tweak(
                "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/ExportVotes?output=XML",
                update={"sessionId": remote_session_id},
            ),
            # Parliaments before the 42nd are long closed, so their cached
            # responses can be trusted indefinitely.
            use_cache=session.parliament.number < 42,
        ),
        "lxml",
    )
    for overview in tqdm(
            # "voteparticipant" is oddly named considering the previous
            # format we found this in
            parl_soup.find_all("voteparticipant"),
            desc=str(session),
            unit="vote",
    ):
        self.fetch_vote(overview, session)
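
# For reference, each <voteparticipant> overview element in the ExportVotes
# XML is assumed to look roughly like the following; the element names are
# inferred from the fields fetch_vote reads, and the values are illustrative:
#
#   <voteparticipant>
#     <decisiondivisionnumber>42</decisiondivisionnumber>
#     <decisionresultname>Agreed To</decisionresultname>
#     <decisioneventdatetime>2016-02-01T18:15:00</decisioneventdatetime>
#     <billnumbercode>C-4</billnumbercode>
#   </voteparticipant>
#
# Tag names appear lowercased here because BeautifulSoup's lxml parser
# normalizes them; the feed itself may use CamelCase.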
def parse_sitting_url(self, sitting_url, session):
    """Parse a House sitting page, recording its date and publication links."""
    try:
        sitting_number = SITTING.search(sitting_url).groups()[0].upper()
        sitting = models.Sitting(
            session=session,
            number=sitting_number,
            slug="-".join((session.slug, sitting_number.lower())),
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(
                fetch_url(
                    sitting_url,
                    use_cache=(
                        session.parliament.number,
                        int(NUMBERS.search(sitting.number).groups()[0]),
                    ) < (42, 190),
                ),
                "html.parser",
            )
            if lang == EN:
                sitting.date = dateparse(
                    soup.select("#load-publication-selector")[0].text)
            for tab in soup.select(".publication-tabs > li"):
                if "disabled" not in tab["class"]:
                    sitting.links[lang][", ".join((sources.NAME_HOC[lang], tab.a.text))] = urljoin(
                        sitting_url, tab.a.attrs.get("href", sitting_url))
                    if lang == EN and "Hansard" in tab.a.text:
                        sitting.links[EN][sources.NAME_OP[EN]] = (
                            f"https://openparliament.ca/debates/"
                            f"{sitting.date.year}/{sitting.date.month}/{sitting.date.day}/"
                        )
            xml_button = one_or_none(soup.select(".btn-export-xml"))
            if xml_button:
                sitting.links[lang][sources.NAME_HOC_HANSARD_XML[lang]] = urljoin(
                    sitting_url, xml_button.attrs["href"])
            if lang == EN:
                # Swap to the French page for the second pass.
                sitting_url = get_french_parl_url(sitting_url, soup)
        sitting.save()
    except Exception as e:
        logger.exception(e)
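
# SITTING and NUMBERS are module-level patterns defined elsewhere. Judging by
# how they are used above, they behave roughly like these illustrative
# definitions (assumptions, not the actual patterns):
#
#   SITTING = re.compile(r"/sitting-(\w+)/", re.IGNORECASE)  # e.g. "57" or "57a"
#   NUMBERS = re.compile(r"(\d+)")  # leading digits of a sitting number
#
# Sitting numbers can evidently carry a letter suffix (hence the
# .upper()/.lower() handling), while NUMBERS strips that suffix down to an
# integer for the cache-cutoff comparison.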
def fetch_senate_committees_session(self, session, session_url):
    """Fetch Senate committees for a session, in both languages."""
    for link in tqdm(
            BeautifulSoup(
                fetch_url(session_url),
                "html.parser",
            ).select(".committee-list-boxes-wrapper a"),
            desc=str(session),
            unit="committee",
    ):
        committee_url = {
            EN: url_tweak(urljoin(session_url, link.attrs["href"]))
        }
        if link.select(".joint-committee-list-boxes"):
            logger.debug(
                "Skipping {} (broken, reported; joint committees are covered in HoC anyway)".format(
                    committee_url[EN]))
            continue
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_SEN,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]), "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = soup.select(
                "meta[name=dc.description]")[0].attrs["content"]
            # The committee code is the second-to-last path component of
            # the committee's URL.
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = committee_url[
                lang].strip("/").split("/")[-2].upper()
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
            if not committee.slug:
                # First (English) pass only: derive the slug and pull the
                # French URL from the English page.
                committee.slug = self.get_slug(committee)
                committee_url[FR] = get_french_parl_url(committee_url[lang], soup)
        committee.save()
def fetch_vote(self, overview, session):
    """Fetch a single House vote's details and its participants."""
    number = overview.decisiondivisionnumber.text
    vote = models.HouseVote(
        slug="-".join((session.slug, number)),
        number=number,
        result=RESULT_MAPPING[overview.decisionresultname.text],
    )
    vote.links[EN][sources.NAME_HOC_VOTE_DETAILS[EN]] = (
        "http://www.ourcommons.ca/Parliamentarians/en/votes/{}/{}/{}/".format(
            session.parliament.number,
            session.number,
            number,
        )
    )
    soup = {}
    for lang in (EN, FR):
        soup[lang] = BeautifulSoup(
            fetch_url(
                vote.links[lang][sources.NAME_HOC_VOTE_DETAILS[lang]],
                sometimes_refetch=False,
            ),
            "html.parser",
        )
        details = one_or_none(soup[lang].select(".voteDetailsText"))
        if details:
            vote.context[lang] = soup_to_text(details)
        if lang == EN:
            vote.links[FR][sources.NAME_HOC_VOTE_DETAILS[FR]] = get_french_parl_url(
                vote.links[lang][sources.NAME_HOC_VOTE_DETAILS[lang]],
                soup[lang],
            )
    try:
        vote.sitting = models.Sitting.objects.get(
            session=session,
            date=dateparse(overview.decisioneventdatetime.text),
        )
    except Exception:
        # Sometimes the XML listings show the wrong dates.
        # I've contacted [email protected] about this.
        # Fall back to the date shown on the vote's details page.
        element = BeautifulSoup(
            fetch_url(vote.links[EN][sources.NAME_HOC_VOTE_DETAILS[EN]]),
            "html.parser",
        ).select("#VoteDetailsHeader .voteDetailsTopHeaderContent")[1]
        vote.sitting = models.Sitting.objects.get(
            session=session,
            date=dateparse(element.text.split(" - ")[1]),
        )
    if overview.billnumbercode.text:
        vote.bill = models.Bill.objects.get(
            slug="-".join((session.slug, *overview.billnumbercode.text.split("-"))))
    vote.save()

    # Fetch the parliamentarian votes.
    # TODO: This has been temporarily written to scrape off of HTML
    # as the new XML format omits party affiliation.
    for row in soup[EN].select("#parlimant > tbody > tr"):  # The page's markup misspells "parliament"
        self.fetch_vote_participant(row, vote, soup)
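
# RESULT_MAPPING is defined elsewhere in this module. A minimal sketch of the
# shape it presumably takes, translating ourcommons.ca result strings into
# model constants (both the keys and the constant names here are assumptions):
#
#   RESULT_MAPPING = {
#       "Agreed To": models.HouseVote.RESULT_AGREED_TO,
#       "Negatived": models.HouseVote.RESULT_NEGATIVED,
#   }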