def scrape_bill(self, session, bill_url):
    """Scrape a single bill detail page.

    Fetches ``bill_url``, builds a ``Bill`` from the page (identifiers,
    subjects, sponsors, versions, documents and actions) and yields every
    item produced by ``self.scrape_vote_events`` followed by the bill itself.

    A page without a usable bill-number link, without a recognisable bill
    type, or without a title is logged as a warning and skipped, in which
    case nothing is yielded.

    Args:
        session: Legislative session identifier, passed through to ``Bill``
            and to the companion-bill relation.
        bill_url: URL of the bill detail page; also recorded as the bill's
            source and used to make page links absolute.

    Yields:
        The objects yielded by ``self.scrape_vote_events`` and, last, the
        populated ``Bill``.
    """
    page = lxml.html.fromstring(self.get(bill_url).text)
    page.make_links_absolute(bill_url)

    bill_links = page.xpath('//span[@id="lblBillNumber"]/a')
    if not bill_links or not bill_links[0].text:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    bill_link = bill_links[0]
    bill_id = bill_link.text

    # A matching (companion) bill is present when this span holds a link.
    comp_links = page.xpath('//span[@id="lblCompNumber"]/a')
    secondary_bill_id = comp_links[0].text if comp_links else None
    if secondary_bill_id:
        # Swap ids if "*" is in secondary_bill_id, so the starred id
        # becomes the bill's own identifier.
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        # Collapse any run of whitespace to a single space and trim.
        secondary_bill_id = " ".join(secondary_bill_id.split())

    bill_id = " ".join(bill_id.replace("*", "").split())

    # Order matters: "JR" must be tested before the bare "R".
    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        # Previously this fell through with bill_type unbound and crashed
        # when constructing the Bill.
        self.logger.warning(
            "Could not determine bill type for %s, skipping.", bill_id
        )
        return

    primary_chamber = "lower" if "H" in bill_id else "upper"

    abstract_spans = page.xpath("//span[@id='lblAbstract']")
    title = abstract_spans[0].text if abstract_spans else None
    if title is None:
        self.logger.warning("%s detail page was missing title info.", bill_id)
        return

    # Bill subjects: the comma-separated text before the first "-" in the
    # title. A title without "-" carries no subject prefix.
    subject_text, dash, _ = title.partition("-")
    subjects = []
    if dash:
        subjects = [s.strip() for s in subject_text.split(",") if s.strip()]

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if comp_links:
        # NOTE(review): this uses the raw text of the companion link, so
        # when the ids were swapped above it is the "*"-marked id that
        # became bill_id -- confirm that is the intended companion.
        bill.add_related_bill(
            identifier=comp_links[0].text_content().strip(),
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary sponsor: the text after the last "by" in the sponsor span.
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
        .text_content()
        .split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor,
            classification="primary",
            entity_type="person",
            primary=True,
        )

    # Bill text: the bill-number link itself points at the document.
    bill.add_version_link(
        "Current Version",
        bill_link.get("href"),
        media_type="application/pdf",
    )

    # Documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))

    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))

    for amendment in page.xpath('//a[contains(@href, "/Amend/")]'):
        # .text is None when the anchor's label sits inside child markup.
        label = amendment.text or amendment.text_content()
        bill.add_version_link(
            "Amendment " + label,
            amendment.get("href"),
            media_type="application/pdf",
        )

    # Amendment notes are images inside an <a>; the alt text names the doc.
    for afn in page.xpath('//img[contains(@alt, "Fiscal Memo")]'):
        bill.add_document_link(
            afn.get("alt"),
            afn.getparent().get("href"),
            on_duplicate="ignore",
        )

    # Actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # If there is a matching bill, pull in its sponsor and actions too.
    if secondary_bill_id:
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = (
            secondary_sponsor.replace("*", "").replace(")", "").strip()
        )
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # Secondary actions
        cotables = page.xpath("//table[@id='gvCoActionHistory']")
        if cotables:
            actions_from_table(bill, cotables[0])

    # Votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    # Actions from both tables were appended in turn; order them by date.
    bill.actions.sort(key=lambda a: a["date"])

    yield bill
def test_full_bill():
    """Import a fully-populated bill and check every related record.

    Builds a prior-session bill plus a bill carrying subjects, an extra
    identifier and title, two actions, a related bill, two sponsorships,
    an abstract, document/version links and a source, runs both through
    ``BillImporter`` and asserts on what is read back from the database.
    """
    create_jurisdiction()
    adam = Person.objects.create(name="Adam Smith")
    house = Organization.objects.create(
        jurisdiction_id="jid",
        name="House",
        classification="lower",
    )
    Membership.objects.create(person_id=adam.id, organization_id=house.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=house,
    )

    # Both scraped bills share the same title, classification and chamber.
    shared_kwargs = {"classification": "tax bill", "chamber": "lower"}
    prior_bill = ScrapeBill("HB 99", "1899", "Axe & Tack Tax Act", **shared_kwargs)
    scraped = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", **shared_kwargs)

    scraped.subject = ["taxes", "axes"]
    scraped.add_identifier("SB 9")
    scraped.add_title("Tack & Axe Tax Act")

    scraped.add_action("introduced in house", "1900-04-01", chamber="lower")
    committee_action = scraped.add_action(
        "sent to arbitrary committee", "1900-04-04", chamber="lower"
    )
    committee_pseudo_id = _make_pseudo_id(name="Arbitrary Committee")
    committee_action.add_related_entity(
        "arbitrary committee", "organization", committee_pseudo_id
    )

    scraped.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session"
    )

    # One sponsor that can be linked to a Person, one that cannot.
    scraped.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    scraped.add_sponsorship(
        "Jane Smith",
        classification="lead sponsor",
        entity_type="person",
        primary=True,
    )

    scraped.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )

    # Two links under the same document note, one version link.
    scraped.add_document_link(
        "Fiscal Note", "http://example.com/fn.pdf", media_type="application/pdf"
    )
    scraped.add_document_link(
        "Fiscal Note", "http://example.com/fn.html", media_type="text/html"
    )
    scraped.add_version_link(
        "Fiscal Note", "http://example.com/v/1", media_type="text/html"
    )
    scraped.add_source("http://example.com/source")

    # Run the import, prior-session bill first.
    payload = [prior_bill.as_dict(), scraped.as_dict()]
    BillImporter("jid").import_data(payload)

    # Read the bill back and compare its core fields.
    imported = Bill.objects.get(identifier="HB 1")
    assert imported.from_organization.classification == "lower"
    assert imported.identifier == scraped.identifier
    assert imported.title == scraped.title
    assert imported.classification == scraped.classification
    assert imported.subject == ["taxes", "axes"]
    assert imported.abstracts.get().note == "official"
    assert imported.abstracts.get().date == "1969-10-20"

    # The extra title and identifier were stored.
    assert imported.other_titles.get().title == "Tack & Axe Tax Act"
    assert imported.other_identifiers.get().identifier == "SB 9"

    # Actions: order must be preserved (a failure here would be intermittent).
    stored_actions = list(imported.actions.all())
    assert len(stored_actions) == 2
    lower_org = Organization.objects.get(classification="lower")
    assert stored_actions[0].organization == lower_org
    assert stored_actions[0].description == "introduced in house"
    assert stored_actions[1].description == "sent to arbitrary committee"
    committee_org = Organization.objects.get(classification="committee")
    assert stored_actions[1].related_entities.get().organization == committee_org

    # Fields computed from the actions.
    assert imported.first_action_date == "1900-04-01"
    assert imported.latest_action_date == "1900-04-04"
    assert imported.latest_action_description == "sent to arbitrary committee"

    # The related bill was added and resolved to the prior-session bill.
    related = imported.related_bills.get()
    assert related.identifier == "HB 99"
    assert related.related_bill.identifier == "HB 99"

    # Sponsorships: the primary one is unlinked, the other links to Adam.
    stored_sponsorships = imported.sponsorships.all()
    assert len(stored_sponsorships) == 2
    adam_from_db = Person.objects.get(name="Adam Smith")
    for sponsorship in stored_sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == adam_from_db
            continue
        assert sponsorship.person is None
        assert sponsorship.organization is None

    # Versions and documents, with their links.
    stored_versions = imported.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = imported.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # Sources
    assert imported.sources.count() == 1