Example #1
def test_bill_type_setting():
    # default
    b = Bill(identifier="some bill",
             legislative_session="session",
             title="the title")
    assert b.classification == ["bill"]

    # string -> list
    b = Bill(
        identifier="some bill",
        legislative_session="session",
        title="the title",
        classification="string",
    )
    assert b.classification == ["string"]

    # list unmodified
    b = Bill(
        identifier="some bill",
        legislative_session="session",
        title="the title",
        classification=["two", "items"],
    )
    assert b.classification == ["two", "items"]

    # tuple -> list
    b = Bill(
        identifier="some bill",
        legislative_session="session",
        title="the title",
        classification=("two", "items"),
    )
    assert b.classification == ["two", "items"]
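
# The coercion these assertions exercise is small; a minimal sketch, assuming
# Bill normalizes its classification argument on construction (the helper
# name below is hypothetical, not the real internals):
def _coerce_classification(value=None):
    if value is None:
        return ["bill"]     # default
    if isinstance(value, str):
        return [value]      # bare string -> single-item list
    return list(value)      # list kept as-is, tuple -> list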
Example #2
def test_save_related():
    s = Scraper(juris, "/tmp/")
    p = Bill("HB 1", "2021", "Test")
    p.add_source("http://example.com")
    o = Bill("HB 2", "2021", "Test")
    o.add_source("http://example.com")
    p._related.append(o)

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY),
    ]
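
# Why json.dump is called twice above: save_object presumably writes the
# object itself, then recurses into anything queued on _related. A rough
# sketch under that assumption (filename convention borrowed from
# test_save_object_basics below; the real encoder and paths differ):
import json

def save_object_sketch(obj):
    with open("/tmp/bill_" + obj._id + ".json", "w") as f:
        json.dump(obj.as_dict(), f)
    for related in obj._related:
        save_object_sketch(related)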
Example #3
def test_save_object_invalid():
    s = Scraper(juris, "/tmp/")
    p = Bill("HB 1", "2021", "Test")
    # no source, won't validate

    with pytest.raises(ValueError):
        s.save_object(p)
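
# The ValueError presumably comes from pre-save validation; the comment
# above ("no source, won't validate") suggests a check along these lines:
def check_has_source(bill):
    if not bill.sources:
        raise ValueError("no sources for {}".format(bill.identifier))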
Example #4
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()')[
                0
            ].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath('//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title, fiscal_url, media_type="application/pdf",
            )

        # effective date, where available
        if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
            eff_date = page.xpath(
                '//div[contains(text(), "Effective Date(s)")]/text()'
            )[0].strip()
            eff_date = eff_date.replace("Effective Date(s):", "").strip()
            # this can contain multiple dates, e.g. "July 1, 2020, July 1, 2022"
            bill.extras["date_effective"] = eff_date

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
Example #5
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = page.xpath(
            '//span[@id="ctl00_CPHBody_txtTitulo"]/text()')[0].strip()
        bill_id = page.xpath(
            '//span[@id="ctl00_CPHBody_txt_Medida"]/text()')[0].strip()

        bill_type = self.classify_bill_type(bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        start_year = session[0:4]
        self.scrape_author_table(start_year, bill, bill_id)

        # action table contains votes, hence the yield
        yield from self.scrape_action_table(chamber, bill, page, url)

        bill.add_source(url)
        yield bill
Example #6
    def parse_bill(self, chamber, session, special, link):
        bill_num = link.text.strip()
        type_abbr = re.search(r"type=(B|R)", link.attrib["href"]).group(1)

        if type_abbr == "B":
            btype = ["bill"]
        elif type_abbr == "R":
            btype = ["resolution"]

        bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

        url = utils.info_url(chamber, session, special, type_abbr, bill_num)
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = "/".join(
            [
                '//div[contains(@class, "BillInfo-ShortTitle")]',
                'div[@class="BillInfo-Section-Data"]',
            ]
        )

        if page.xpath(xpath):
            title = page.xpath(xpath).pop().text_content().strip()
        else:
            self.warning("Skipping {} {}, No title found".format(bill_id, url))
            return

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=btype,
        )
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(
            bill,
            chamber,
            utils.history_url(chamber, session, special, type_abbr, bill_num),
        )

        # only fetch votes if votes were seen in history
        # if vote_count:
        yield from self.parse_votes(
            bill, utils.vote_url(chamber, session, special, type_abbr, bill_num)
        )

        # Dedupe sources while preserving order; removing items from the
        # list while iterating over it would skip elements.
        deduped = []
        for source in bill.sources:
            if source not in deduped:
                deduped.append(source)
        bill.sources = deduped

        yield bill
Example #7
    def parse_bill(self, url):
        xml = self.get(url).content
        xml = ET.fromstring(xml)

        bill_num = self.get_xpath(xml, "bill/billNumber")
        bill_type = self.get_xpath(xml, "bill/billType")

        bill_id = "{} {}".format(bill_type, bill_num)

        chamber_name = self.get_xpath(xml, "bill/originChamber")
        chamber = self.chambers[chamber_name]

        title = self.get_xpath(xml, "bill/title")

        classification = self.classifications[bill_type]

        session = self.get_xpath(xml, "bill/congress")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=classification,
        )

        self.scrape_actions(bill, xml)
        self.scrape_amendments(bill, xml, session, chamber, bill_id)
        self.scrape_cbo(bill, xml)
        self.scrape_committee_reports(bill, xml)
        self.scrape_cosponsors(bill, xml)
        self.scrape_laws(bill, xml)
        self.scrape_related_bills(bill, xml)
        self.scrape_sponsors(bill, xml)
        self.scrape_subjects(bill, xml)
        self.scrape_summaries(bill, xml)
        self.scrape_titles(bill, xml)
        self.scrape_versions(bill, xml)

        # https://www.congress.gov/bill/116th-congress/house-bill/1
        xml_url = "https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/{type}/BILLSTATUS-{congress}{type}{num}.xml"
        bill.add_source(
            xml_url.format(congress=session,
                           type=bill_type.lower(),
                           num=bill_num))

        cg_url = (
            "https://congress.gov/bill/{congress}th-congress/{chamber}-{type}/{num}"
        )
        bill.add_source(
            cg_url.format(
                congress=session,
                chamber=chamber_name.lower(),
                type=classification.lower(),
                num=bill_num,
            ))

        yield bill
Example #8
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial",
                "CMR": "concurrent memorial",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row[3],
                classification=bill_type,
            )
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(
                    primary,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(" ", "")))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type="application/pdf")

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Example #9
    def scrape_bill(self, bill_page_url):
        bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

        title = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()'
        )
        if title:
            title = title[0]
        else:
            self.warning("Missing bill title {}".format(bill_page_url))
            return False

        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()'
        )
        if bill_no:
            bill_no = bill_no[0]
        else:
            bill_no = bill_page.xpath(
                '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()'
            )
            if bill_no:
                bill_no = bill_no[0]
            else:
                self.error("Missing bill number {}".format(bill_page_url))
                return False

        bill = Bill(
            bill_no,
            legislative_session=self.session,
            chamber="legislature",
            title=title,
            classification="bill",
        )

        bill.add_source(bill_page_url)

        self.parse_versions(bill, bill_page, bill_no)
        self.parse_acts(bill, bill_page)

        sponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()'
        )
        if sponsors:
            self.assign_sponsors(bill, sponsors[0], "primary")

        cosponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()'
        )
        if cosponsors:
            self.assign_sponsors(bill, cosponsors[0], "cosponsor")

        self.parse_date_actions(bill, bill_page)
        self.parse_actions(bill, bill_page)

        yield bill
Example #10
    def get_bill(self, bill_id, **kwargs):
        if bill_id == "1":
            assert kwargs == {"extra": "param"}
            raise self.ContinueScraping
        else:
            assert bill_id == "2"
            assert kwargs == {}
            b = Bill("1", self.legislative_session, "title")
            b.add_source("http://example.com")
            return b
Example #11
def toy_bill():
    b = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    b.add_source("http://uri.example.com/", note="foo")
    return b
Example #12
def test_from_organization():
    # none set
    assert get_pseudo_id(Bill("HB 1", "2014", "Some Bill").from_organization) == {
        "classification": "legislature"
    }

    # chamber set
    assert get_pseudo_id(
        Bill("SB 1", "2014", "Some Bill", chamber="upper").from_organization
    ) == {"classification": "upper"}
    # org direct set
    assert (
        Bill("HB 1", "2014", "Some Bill", from_organization="test").from_organization
        == "test"
    )

    # can't set both
    with pytest.raises(ValueError):
        Bill("HB 1", "2014", "Some Bill", from_organization="upper", chamber="upper")
Example #13
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {"H": "lower", "S": "upper"}

        for row in page:
            if row["sess_year"] != session:
                continue
            bill_id = row["bill_num"]
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            if re.match(r"^(S|H)J", bill_id):
                bill_type = "joint resolution"
            elif re.match(r"^(S|H)R", bill_id):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title=row["bill_title"],
                classification=bill_type,
                chamber=chamber,
            )
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                introducer = string.capwords(
                    introducer.decode("utf-8").replace("Rep. ", "").replace("Sen. ", "")
                )
                if "Dist." in introducer:
                    introducer = " ".join(introducer.split()[:-2])
                bill.add_sponsorship(
                    name=introducer,
                    classification="primary",
                    primary=True,
                    entity_type="person",
                )

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning("no such bill: " + bill_id)
Example #14
class HouseSearchPage(HtmlListPage):
    """
    House committee roll calls are not available on the Senate's
    website. Furthermore, the House uses an internal ID system in
    its URLs, making accessing those pages non-trivial.

    This will fetch all the House committee votes for the
    given bill, and add the votes to that object.
    """

    input_type = Bill
    example_input = Bill("HB 1",
                         "2020",
                         "title",
                         chamber="upper",
                         classification="bill")
    selector = XPath(
        '//a[contains(@href, "/Bills/billsdetail.aspx?BillId=")]/@href')

    def get_source_from_input(self):
        url = "https://www.myfloridahouse.gov/Sections/Bills/bills.aspx"
        # Keep the digits and all following characters in the bill's ID
        bill_number = re.search(r"^\w+\s(\d+\w*)$",
                                self.input.identifier).group(1)
        session_number = {
            "2022D": "96",
            "2022C": "95",
            "2022": "93",
            "2021B": "94",
            "2021A": "92",
            "2021": "90",
            "2020": "89",
            "2019": "87",
            "2018": "86",
            "2017A": "85",
            "2017": "83",
            "2016": "80",
            "2015C": "82",
            "2015B": "81",
            "2015A": "79",
            "2015": "76",
            "2014O": "78",
            "2014A": "77",
            "2016O": "84",
        }[self.input.legislative_session]

        form = {
            "Chamber": "B",
            "SessionId": session_number,
            "BillNumber": bill_number
        }
        return url + "?" + urlencode(form)

    def process_item(self, item):
        return HouseBillPage(self.input, source=item)
Example #15
class HouseBillPage(HtmlListPage):
    selector = XPath('//a[text()="See Votes"]/@href', min_items=0)
    example_input = Bill(
        "HB 1", "2020", "title", chamber="upper", classification="bill"
    )
    example_source = (
        "https://www.myfloridahouse.gov/Sections/Bills/billsdetail.aspx?BillId=69746"
    )

    def process_item(self, item):
        return HouseComVote(self.input, source=item)
Example #16
    def _recursively_process_bills(
        self, request_session, chamber, session, first_item=1
    ):
        """
        Once a search has been initiated, this function will save a
        Bill object for every Paper from the given chamber
        """

        url = "http://legislature.maine.gov/LawMakerWeb/searchresults.asp"
        r = request_session.get(url, params={"StartWith": first_item})
        r.raise_for_status()

        bills = lxml.html.fromstring(r.text).xpath("//tr/td/b/a")
        seen = set()
        if bills:
            for bill in bills:
                bill_id_slug = bill.xpath("./@href")[0]
                if bill_id_slug == "summary.asp?ID=280068396":
                    continue
                bill_url = "http://legislature.maine.gov/LawMakerWeb/{}".format(
                    bill_id_slug
                )
                bill_id = bill.text[:2] + " " + bill.text[2:]

                if (
                    session in BLACKLISTED_BILL_IDS
                    and bill_id in BLACKLISTED_BILL_IDS[session]
                ):
                    continue

                # avoid duplicates
                if bill_id in seen:
                    continue
                seen.add(bill_id)

                bill = Bill(
                    identifier=bill_id,
                    legislative_session=session,
                    title="",
                    chamber=chamber,
                )
                bill.add_source(bill_url)

                yield from self.scrape_bill(bill, chamber)
                yield bill

            # Make a recursive call to this function, for the next page
            PAGE_SIZE = 25
            yield from self._recursively_process_bills(
                request_session=request_session,
                chamber=chamber,
                session=session,
                first_item=first_item + PAGE_SIZE,
            )
Example #17
    def process_page(self):
        chamber = "upper" if self.input.identifier.startswith("S") else "lower"
        short_title = self.get_column_div("Summary").text
        long_title = CSS("#title").match_one(self.root).text

        if "*" in self.input.identifier:
            stars = re.search(r"\*+", self.input.identifier).group()
            if (
                self.input.session in CARRYOVERS
                and stars in CARRYOVERS[self.input.session]
            ):
                self.input.identifier = re.sub(
                    r"\*+",
                    "-" + CARRYOVERS[self.input.session][stars],
                    self.input.identifier,
                )
            else:
                self.logger.error(
                    f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
                )
                return

        bill = Bill(
            identifier=self.input.identifier,
            legislative_session=self.input.session,
            title=short_title,
            chamber=chamber,
        )
        bill.subject = self.input.subjects
        # use the pretty source URL
        bill.add_source(self.input.source_url)
        bill.add_title(long_title)

        try:
            sponsors = self.get_column_div("Primary Sponsor")
            self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
        except SelectorError:
            pass
        try:
            cosponsors = self.get_column_div("Co-Sponsor")
            self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
        except SelectorError:
            pass
        # TODO: figure out cosponsor div name, can't find any as of Feb 2021
        self.add_actions(bill, chamber)

        bdr = extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        text_url = self.source.url.replace("Overview", "Text")
        yield BillTabText(bill, source=text_url)
Example #18
def test_save_object_basics():
    # ensure that save object dumps a file
    s = Scraper(juris, "/tmp/")
    p = Bill("HB 1", "2021", "Test")
    p.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = "bill_" + p._id + ".json"
    assert filename in s.output_names["bill"]
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)
Example #19
def test_whitespace_is_stripped():
    s = Scraper(juris, "/tmp/")
    b = Bill(" HB 11", "2020", " a short title     ")
    b.subject = [" one", "two ", "   three "]
    b.add_source("https://example.com/     ")

    s.save_object(b)

    # the simple cases, and nested lists / objects
    assert b.identifier == "HB 11"
    assert b.title == "a short title"
    assert b.sources[0]["url"] == "https://example.com/"
    assert b.subject == ["one", "two", "three"]
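
# A sketch of the recursive cleanup this test exercises: strings are stripped,
# and lists/dicts are walked so nested values (sources, subjects) get the same
# treatment. The function name is hypothetical; the real hook runs inside
# save_object:
def strip_whitespace(value):
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, list):
        return [strip_whitespace(v) for v in value]
    if isinstance(value, dict):
        return {k: strip_whitespace(v) for k, v in value.items()}
    return value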
Example #20
    def scrape_bill(self, session, session_slug, chamber, url):
        page = lxml.html.fromstring(self.get(url).text)
        bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

        # bill data gets filled in from another call
        bill_data_base = (
            "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
            "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}")
        bill_data_url = bill_data_base.format(session_slug, internal_id,
                                              time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        short_title = self.get_header_field(bill_page, "Summary:").text
        short_title = short_title.replace("\u00a0", " ")

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber,
        )

        long_title = self.get_header_field(bill_page, "Title:").text
        if long_title is not None:
            bill.add_abstract(long_title, "Summary")

        sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
        if sponsor_div is not None:
            self.add_sponsors(sponsor_div, bill, "primary")

        cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
        if cosponsor_div is not None:
            self.add_sponsors(cosponsor_div, bill, "cosponsor")

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        bill.extras["NV_ID"] = internal_id

        bill.add_source(url)
        yield bill
Example #21
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith(("S", "H")):
                continue

            # create a bill
            desc = bill.xpath("text()")[0].strip()
            chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            bill = Bill(
                bill_id,
                self.kwargs["session"],
                desc,
                chamber=chamber,
                classification=bill_type,
            )

            bill_url = link.get("href")
            sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
                self.kwargs["session_id"], bill_id.replace(" ", ""))

            list(
                self.scrape_page_items(BillSponsorPage,
                                       url=sponsor_url,
                                       obj=bill))
            yield from self.scrape_page_items(BillDetailPage,
                                              url=bill_url,
                                              obj=bill)
            bill.subject = self.kwargs["subjects"][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage,
                                              url=next_url[0],
                                              **self.kwargs)
Example #22
    def scrape_bill(self, chamber, session, bill_id, session_id):
        bill_json_url = (
            "https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&"
            "legislativeBody={}".format(bill_id, session_id, self.chamber_map[chamber])
        )
        response = self.get(bill_json_url, timeout=80)
        page = json.loads(response.content.decode("utf-8"))

        if not page:
            self.warning("null page for %s", bill_id)
            return

        bill_title = page["ShortTitle"]
        bill_id = page["Number"]
        internal_id = page["BillId"]
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        self.scrape_actions(bill, page, chamber)
        self.scrape_versions_and_documents(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)
        yield from self.scrape_votes(bill, page)

        bill_url = (
            "https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}".format(
                internal_id, session_id
            )
        )
        bill.add_source(bill_url)

        bill.actions = sorted(bill.actions, key=lambda action: action["date"])

        yield bill
Example #23
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.kwargs["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.kwargs["subjects"][subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            bill.add_sponsorship(sp, "primary", "person", True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Example #24
    def process_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.input["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.subjects[subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        sponsor = re.sub(r",\s+(Jr|Sr)\.", r" \1.", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            sp_type = "organization" if "committee" in sp.lower() else "person"
            bill.add_sponsorship(sp, "primary", sp_type, True)

        return BillDetail(bill)
Example #25
    def scrape_prefiles(self, session):
        url = 'https://www.legis.iowa.gov/legislation/billTracking/prefiledBills'
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        for row in page.xpath('//table[contains(@class, "sortable")]/tr[td]'):
            title = row.xpath('td[2]/a/text()')[0].strip()
            url = row.xpath('td[2]/a/@href')[0]

            bill_id = self.extract_doc_id(title)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber='legislature',
                title=title,
                classification='proposed bill',
            )

            if row.xpath('td[3]/a'):
                document_url = row.xpath('td[3]/a/@href')[0]
                if '.docx' in document_url:
                    media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                elif '.pdf' in document_url:
                    media_type = 'application/pdf'
                else:
                    # avoid a NameError on an unexpected file extension
                    media_type = ''
                bill.add_document_link(
                    note="Background Statement",
                    url=document_url,
                    media_type=media_type
                )

            bill.add_version_link(
                note="Prefiled",
                url=url,
                media_type="application/pdf"
            )

            bill.add_source(url)

            yield bill
Example #26
    def scrape_bill_list(self, chamber, session, url):
        if "joint_resolution" in url:
            bill_type = "joint resolution"
        elif "resolution" in url:
            bill_type = "resolution"
        elif "bill" in url:
            bill_type = "bill"
        else:
            # avoid a NameError below if the URL matches none of the patterns
            self.warning("unknown bill type for %s" % url)
            return

        try:
            data = self.get(url).text
        except scrapelib.HTTPError:
            self.warning("skipping URL %s" % url)
            return
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)
        bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
        for b in bill_list:
            bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
            bill_id = bill_url.rsplit("/", 1)[-1]
            bill_id = bill_id.upper()

            title = (
                b.xpath('./div[@class="span6"]/text()')[0]
                .replace(" - Relating to: ", "")
                .strip()
            )

            bill = Bill(
                bill_id,
                legislative_session=session,
                title=title,
                chamber=chamber,
                classification=bill_type,
            )
            bill.subject = list(set(self.subjects[bill_id]))
            yield from self.scrape_bill_history(bill, bill_url, chamber)

            yield bill
Example #27
    def scrape_bill(self, session, chamber, bill_url):
        try:
            page = self.lxmlize("{}{}".format(CO_URL_BASE, bill_url))
        except scrapelib.HTTPError as e:
            if e.response.status_code == 503:
                self.error("Skipping %s w/ 503", bill_url)
                return
            else:
                raise

        bill_number = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()'
        )[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])'
        )
        bill_summary = bill_summary.replace("Read More", "").strip()
        bill = Bill(
            bill_number, legislative_session=session, chamber=chamber, title=bill_title
        )
        if bill_summary:
            bill.add_abstract(bill_summary, "summary")
        bill.add_source("{}{}".format(CO_URL_BASE, bill_url))

        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_amendments(bill, page)
        yield bill
        yield from self.scrape_votes(session, bill, page)
Example #28
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(
                page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Example #29
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.get(url).json()
        api_id = page["BillId"]

        if re.match(r"^(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(f"https://sdlegislature.gov/Session/Bill/{api_id}")
        bill.add_source(url)

        version_rows = page["Documents"]
        assert len(version_rows) > 0
        for version in version_rows:
            date = version["DocumentDate"]
            if date:
                match = re.match(r"\d{4}-\d{2}-\d{2}", date)
                date = datetime.datetime.strptime(match.group(0),
                                                  "%Y-%m-%d").date()

                html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
                pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

                note = version["BillVersion"]
                bill.add_version_link(
                    note,
                    html_link,
                    date=date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
                bill.add_version_link(
                    note,
                    pdf_link,
                    date=date,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )
            else:
                self.warning("Version listed but no date or documents")

        sponsors = page["BillSponsor"]
        if sponsors:
            for sponsor in sponsors:
                sponsor_type = "person"
                member = sponsor["Member"]
                # first and last name are available, but UniqueName is the old link text
                # could change later?

                bill.add_sponsorship(
                    member["UniqueName"],
                    classification="primary",
                    primary=True,
                    entity_type=sponsor_type,
                )
        else:
            sponsor_type = "organization"
            committee_sponsor = re.search(r">(.*)</a>",
                                          page["BillCommitteeSponsor"])[1]
            bill.add_sponsorship(
                committee_sponsor,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        for keyword in page["Keywords"]:
            bill.add_subject(keyword["Keyword"]["Keyword"])

        actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
        yield from self.scrape_action(bill, actions_url, chamber)

        yield bill
Example #30
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        bill_id_for_url = bill_id.replace(" ", "")
        bill.add_source(
            f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
        )

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        for version in root.iterfind(
                "billtext/docTypes/bill/versions/version"):
            if not version:
                continue

            note = version.find("versionDescription").text
            html_url = version.find("WebHTMLURL").text
            bill.add_version_link(note=note,
                                  url=html_url,
                                  media_type="text/html")
            pdf_url = version.find("WebPDFURL").text
            bill.add_version_link(note=note,
                                  url=pdf_url,
                                  media_type="application/pdf")

        for analysis in root.iterfind(
                "billtext/docTypes/analysis/versions/version"):
            if not analysis:
                continue

            description = analysis.find("versionDescription").text
            html_url = analysis.find("WebHTMLURL").text
            bill.add_document_link(
                note="Analysis ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        for fiscal_note in root.iterfind(
                "billtext/docTypes/fiscalNote/versions/version"):
            if not fiscal_note:
                continue

            description = fiscal_note.find("versionDescription").text
            html_url = fiscal_note.find("WebHTMLURL").text
            bill.add_document_link(
                note="Fiscal Note ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            atype = _categorize_action(desc)

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill