Пример #1
0
    def process_page(self):
        chamber = "upper" if self.input.identifier.startswith("S") else "lower"
        short_title = self.get_column_div("Summary").text
        long_title = CSS("#title").match_one(self.root).text

        if "*" in self.input.identifier:
            stars = re.search(r"\*+", self.input.identifier).group()
            if (
                self.input.session in CARRYOVERS
                and stars in CARRYOVERS[self.input.session]
            ):
                self.input.identifier = re.sub(
                    r"\*+",
                    "-" + CARRYOVERS[self.input.session][stars],
                    self.input.identifier,
                )
            else:
                self.logger.error(
                    f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
                )
                return

        bill = Bill(
            identifier=self.input.identifier,
            legislative_session=self.input.session,
            title=short_title,
            chamber=chamber,
        )
        bill.subject = self.input.subjects
        # use the pretty source URL
        bill.add_source(self.input.source_url)
        bill.add_title(long_title)

        try:
            sponsors = self.get_column_div("Primary Sponsor")
            self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
        except SelectorError:
            pass
        try:
            cosponsors = self.get_column_div("Co-Sponsor")
            self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
        except SelectorError:
            pass
        # TODO: figure out cosponsor div name, can't find any as of Feb 2021
        self.add_actions(bill, chamber)

        bdr = extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        text_url = self.source.url.replace("Overview", "Text")
        yield BillTabText(bill, source=text_url)
Пример #2
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = "%s/%s" % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace("Bill.aspx", "BillContent.aspx")
        url = url.replace("&code=R", "&code=R&style=new")

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath("//table/tr")
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == "LR Number:"
        # bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if (table_rows[4 + cosponsorOffset][0].text_content().strip() ==
                "Governor Action:"):
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == "Bill String:"
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = "Appropriations Bill"
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title))
                return

        bill = Bill(
            bill_id,
            chamber="lower",
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note="official")

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(bill_sponsor,
                             entity_type="person",
                             classification="primary",
                             primary=True)

        # check for cosponsors
        (sponsors_url,
         ) = bill_page.xpath("//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        (actions_link,
         ) = bill_page.xpath("//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = "%s%s" % (self._house_base_url,
                                 doc_tag[0].attrib["href"])
            bill.add_document_link(doc, text_url, media_type="text/html")

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == "PDF":
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(
                    version,
                    vurl.attrib["href"],
                    media_type=mimetype,
                    on_duplicate="ignore",
                )

        # house bill versions
        # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Text")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath(
                    './/div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(version,
                                      path,
                                      media_type=mimetype,
                                      on_duplicate="ignore")

        # house bill summaries
        # everything between the row containing "Bill Summary" in an h2
        # and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )

        # if there are no amedments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[h2[contains(text(),"Bill Summary")]]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = "Bill Summary ({})".format(version)
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_document_link(summary_name,
                                       path,
                                       media_type=mimetype,
                                       on_duplicate="ignore")

        # house bill amendments
        amendment_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Amendment")]]/'
            'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip(
                )
            path = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip(
                )
            summary_name = "Amendment {}".format(version)

            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = "{} (Defeated)".format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = "{} (Adopted)".format(summary_name)

            distributed_icon = row.xpath(
                './/img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = "{} (Distributed)".format(summary_name)

            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(summary_name,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate="ignore")

        yield bill
Пример #3
0
    def _parse_senate_billpage(self, bill_url, year):
        bill_page = self.lxmlize(bill_url)

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath(
            '//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath(
            '//*[@id="lblBriefDesc"]')[0].text_content()
        # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bid == "XXXXXX":
            self.info("Skipping Junk Bill")
            return

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber="upper",
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_abstract(bill_desc, note="abstract")
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        try:
            sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
        except IndexError:
            sponsor = bill_page.xpath('//span[@id="lSponsor"]')[0]

        bill_sponsor = sponsor.text_content()
        # bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsorship(bill_sponsor,
                             entity_type="person",
                             classification="primary",
                             primary=True)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get("href"):
            self._parse_senate_cosponsors(bill,
                                          cosponsor_tag[0].attrib["href"])

        # get the actions
        action_url = bill_page.xpath('//a[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url = action_url[0].attrib["href"]
            self._parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
        if len(versions_url) > 0 and versions_url[0].attrib.get("href"):
            self._parse_senate_bill_versions(bill,
                                             versions_url[0].attrib["href"])

        amendment_links = bill_page.xpath(
            '//a[contains(@href,"ShowAmendment.asp")]')
        for link in amendment_links:
            link_text = link.xpath("string(.)").strip()
            if "adopted" in link_text.lower():
                link_url = link.xpath("@href")[0]
                bill.add_version_link(
                    link_text,
                    link_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

        yield bill
Пример #4
0
    def scrape_bill(self, row, session):
        bill_id = row["LegislationDisplayCode"]

        amendment = None
        substitute = None

        if bill_id.count(" ") > 1:
            if " w/ " in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" w/ ")
            if " -" in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" -")
            # A bill can _both_ be amended and be substituted
            if " for " in bill_id:
                self.info(
                    "Found substitute to use instead: `{}`".format(bill_id))
                substitute, bill_id = bill_id.split(" for ")
            if amendment is None and substitute is None:
                raise ValueError("unknown bill_id format: " + bill_id)

        bill_type = self.classify_bill(bill_id)
        chamber = "upper" if bill_id.startswith("S") else "lower"

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row["LongTitle"],
            classification=bill_type,
        )
        if row["Synopsis"]:
            bill.add_abstract(row["Synopsis"], "synopsis")
        if row["ShortTitle"]:
            bill.add_title(row["ShortTitle"], "short title")
        if row["SponsorPersonId"]:
            self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"],
                                              "primary")
        if substitute:
            bill.extras["substitute"] = substitute
        if amendment:
            bill.extras["amendment"] = amendment

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
            row["LegislationId"])
        bill.add_source(html_url, note="text/html")

        html = self.lxmlize(html_url)

        additional_sponsors = html.xpath(
            '//label[text()="Additional Sponsor(s):"]'
            "/following-sibling::div/a/@href")
        for sponsor_url in additional_sponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?"
                "personId=", "")
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

        cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                                "following-sibling::div/a/@href")
        for sponsor_url in cosponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?"
                "personId=", "")
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

        versions = html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = "Bill Text"
            bill.add_version_link(version_name,
                                  version_url,
                                  media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row["LegislationId"])

        if row["HasAmendments"] is True:
            self.scrape_amendments(bill, row["LegislationId"])

        yield from self.scrape_votes(bill, row["LegislationId"], session)

        yield bill
Пример #5
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "https://legislature.idaho.gov/legislation/%s/" % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace("\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        archive_year = int(session[0:4])
        not_archive_year = archive_year >= 2009

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id
            if bill_id.strip() == "SB77" and session == "20052006":
                continue

            fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()
            summary = ""

            # Get digest test (aka "summary") from latest version.
            if bill.versions and not_archive_year:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            self.warning(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}
                    if not_archive_year:
                        assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            source_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # Votes for non archived years
            if archive_year > 2009:
                for vote_num, vote in enumerate(bill.votes):
                    if vote.vote_result == "(PASS)":
                        result = True
                    else:
                        result = False

                    if not vote.location:
                        continue

                    full_loc = vote.location.description
                    first_part = full_loc.split(" ")[0].lower()
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    else:
                        # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                        continue

                    if vote.motion:
                        motion = vote.motion.motion_text or ""
                    else:
                        motion = ""

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"

                    motion = motion.strip()
                    motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.compile(r"^(Senate|Assembly) ",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ",
                                    "", motion)
                    motion = re.sub(r" \(\w+\)$", "", motion)
                    motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                    motion)
                    motion = re.sub(
                        r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                        r"Urgency Clause$",
                        "(Urgency Clause)",
                        motion,
                    )
                    motion = re.sub(r"\s+", " ", motion)

                    if not motion:
                        self.warning("Got blank motion on vote for %s" %
                                     bill_id)
                        continue

                    # XXX this is responsible for all the CA 'committee' votes, not
                    # sure if that's a feature or bug, so I'm leaving it as is...
                    # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                    # org = {
                    # 'name': vote_location,
                    # 'classification': vote_classification
                    # }

                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=self._tz.localize(vote.vote_date_time),
                        result="pass" if result else "fail",
                        classification=vtype,
                        # organization=org,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.extras = {"threshold": vote.threshold}

                    fsvote.add_source(source_url)
                    fsvote.pupa_id = source_url + "#" + str(vote_num)

                    rc = {"yes": [], "no": [], "other": []}
                    for record in vote.votes:
                        if record.vote_code == "AYE":
                            rc["yes"].append(record.legislator_name)
                        elif record.vote_code.startswith("NO"):
                            rc["no"].append(record.legislator_name)
                        else:
                            rc["other"].append(record.legislator_name)

                    # Handle duplicate votes
                    for key in rc.keys():
                        rc[key] = list(set(rc[key]))

                    for key, voters in rc.items():
                        for voter in voters:
                            fsvote.vote(key, voter)
                        # Set counts by summed votes for accuracy
                        fsvote.set_count(key, len(voters))

                    yield fsvote
            if len(bill.votes) > 0 and archive_year <= 2009:
                vote_page_url = (
                    "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
                )
                vote_page_url += (
                    f"bill_id={session}{bill.session_num}{fsbill.identifier}")

                # parse the bill data page, finding the latest html text
                data = self.get(vote_page_url).content
                doc = html.fromstring(data)
                doc.make_links_absolute(vote_page_url)
                num_of_votes = len(doc.xpath("//div[@class='status']"))
                for vote_section in range(1, num_of_votes + 1):
                    lines = doc.xpath(
                        f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                    )
                    date, result, motion, vtype, location = "", "", "", "", ""
                    votes = {}
                    for line in lines:
                        line = line.text_content().split()
                        if line[0] == "Date":
                            date = line[1]
                            date = datetime.datetime.strptime(date, "%m/%d/%y")
                            date = self._tz.localize(date)
                        elif line[0] == "Result":
                            result = "pass" if "PASS" in line[1] else "fail"
                        elif line[0] == "Motion":
                            motion = " ".join(line[1:])
                        elif line[0] == "Location":
                            location = " ".join(line[1:])
                        elif len(line) > 1:
                            if line[0] == "Ayes" and line[1] != "Count":
                                votes["yes"] = line[1:]
                            elif line[0] == "Noes" and line[1] != "Count":
                                votes["no"] = line[1:]
                            elif line[0] == "NVR" and line[1] != "Count":
                                votes["not voting"] = line[1:]
                    # Determine chamber based on location
                    first_part = location.split(" ")[0].lower()
                    vote_chamber = ""
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"
                    if len(motion) > 0:
                        fsvote = VoteEvent(
                            motion_text=motion,
                            start_date=date,
                            result=result,
                            classification=vtype,
                            chamber=vote_chamber,
                            bill=fsbill,
                        )
                        fsvote.add_source(vote_page_url)
                        fsvote.pupa_id = vote_page_url + "#" + str(
                            vote_section)

                        for how_voted, voters in votes.items():
                            for voter in voters:
                                voter = voter.replace(",", "")
                                fsvote.vote(how_voted, voter)
                        yield fsvote

            yield fsbill
            self.session.expire_all()
Пример #7
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info("no session, using %s", session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {
                "Senate": "upper",
                "House": "lower",
                "House of Representatives": "lower",
                "house": "lower",
                "senate": "upper",
            }

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {
                "approved": True,
                "passed": True,
                "adopted": True,
                "true": True,
                "false": False,
                "failed": False,
                True: True,
                False: False,
            }

            action_dict = {
                "ref_ctte_100": "referral-committee",
                "intro_100": "introduction",
                "intro_101": "introduction",
                "pass_300": "passage",
                "intro_110": "reading-1",
                "refer_210": "referral-committee",
                "crpt_301": None,
                "crpt_317": None,
                "concur_606": "passage",
                "pass_301": "passage",
                "refer_220": "referral-committee",
                "intro_102": ["introduction", "passage"],
                "intro_105": ["introduction", "passage"],
                "intro_ref_ctte_100": "referral-committee",
                "refer_209": None,
                "intro_108": ["introduction", "passage"],
                "intro_103": ["introduction", "passage"],
                "msg_reso_503": "passage",
                "intro_107": ["introduction", "passage"],
                "imm_consid_360": "passage",
                "refer_213": None,
                "adopt_reso_100": "passage",
                "adopt_reso_110": "passage",
                "msg_507": "amendment-passage",
                "confer_713": None,
                "concur_603": None,
                "confer_712": None,
                "msg_506": "amendment-failure",
                "receive_message_100": "passage",
                "motion_920": None,
                "concur_611": None,
                "confer_735": None,
                "third_429": None,
                "final_501": None,
                "concur_608": None,
                "infpass_217": "passage",
            }

            base_url = "https://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(
                session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(
                first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url,
                                                     "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url,
                                                      "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url,
                                                      "analysiss")

            for row in self.get_bill_rows(session):
                (
                    spacer,
                    number_link,
                    _ga,
                    title,
                    primary_sponsor,
                    status,
                    spacer,
                ) = row.xpath("td")

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace("No.", "")
                bill_id = bill_id.replace(".", "").replace(" ", "")
                # put one space back in between type and number
                bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

                title = title.text_content().strip()
                title = re.sub(r"^Title", "", title)

                chamber = "lower" if "H" in bill_id else "upper"
                classification = "bill" if "B" in bill_id else "resolution"

                if not title and session == "134" and bill_id == "HR 35":
                    # Exception for HR 35 which is a real bill
                    title = "No title provided"
                elif not title:
                    self.warning(f"no title for {bill_id}, skipping")
                    continue
                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=classification,
                )
                bill.add_source(number_link.xpath("a/@href")[0])

                if (session, bill_id) in BAD_BILLS:
                    self.logger.warning(
                        f"Skipping details for known bad bill {bill_id}")
                    yield bill
                    continue

                # get bill from API
                bill_api_url = (
                    "https://search-prod.lis.state.oh.us/solarapi/v1/"
                    "general_assembly_{}/{}/{}/".format(
                        session,
                        "bills" if "B" in bill_id else "resolutions",
                        bill_id.lower().replace(" ", ""),
                    ))
                data = self.get(bill_api_url, verify=False).json()
                if len(data["items"]) == 0:
                    self.logger.warning(
                        "Data for bill {bill_id} has empty 'items' array,"
                        " cannot process related information".format(
                            bill_id=bill_id.lower().replace(" ", "")))
                    yield bill
                    continue

                # add title if no short title
                if not bill.title:
                    bill.title = data["items"][0]["longtitle"]
                bill.add_title(data["items"][0]["longtitle"], "long title")

                # this stuff is version-specific
                for version in data["items"]:
                    version_name = version["version"]
                    version_link = base_url + version["pdfDownloadLink"]
                    bill.add_version_link(version_name,
                                          version_link,
                                          media_type="application/pdf")

                # we'll use latest bill_version for everything else
                bill_version = data["items"][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

                try:
                    action_doc = self.get(base_url +
                                          bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning(
                                "Unknown action {desc} with code {code}."
                                " Add it to the action_dict"
                                ".".format(desc=action_desc,
                                           code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(
                            datetime.datetime.strptime(action["datetime"],
                                                       "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date,
                                        chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill,
                                  base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill,
                                  base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill,
                                  base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill,
                                  base_url)

                # votes
                vote_url = base_url + bill_version["votes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(
                        data["items"][0]["effective_date"], "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(
                        effective_action,
                        effective_date,
                        chamber="executive",
                        classification=["became-law"],
                    )

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url + bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url + bill_version["disapprove"][0][
                        "link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError(
                            "Whoa, a disapprove! We've never"
                            " gotten one before."
                            " Go write some code to deal "
                            "with it: {}".format(disapprove_url))

                yield bill
    def scrape_chamber(self, chamber, session):

        # Pull the session metadata so we can get the
        # slug for the API Request
        meta = next(each for each in self.jurisdiction.legislative_sessions
                    if each["identifier"] == session)
        if meta["classification"] == "special":
            list_slug = self.special_slugs[session]
        else:
            list_slug = 'li'

        list_url = "http://www.kslegislature.org/{}" \
                   "/api/v11/rev-1/bill_status"
        list_url = list_url.format(list_slug)

        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(list_url).text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]

        # there are duplicates
        seen_ids = set()

        for bill_data in bills:

            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue
            # filter duplicates
            if bill_id in seen_ids:
                continue

            seen_ids.add(bill_id)

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"

            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(bill_id,
                        session,
                        title,
                        chamber=chamber,
                        classification=btype)
            bill.extras = {"status": bill_data["STATUS"]}

            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
                bill.add_title(bill_data["LONGTITLE"])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
                primary_sponsor = self.clean_sponsor_name(primary_sponsor)
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type="organization"
                    if "committee" in primary_sponsor.lower() else "person",
                    primary=True,
                    classification="original sponsor",
                )
            for sponsor in bill_data["SPONSOR_NAMES"]:
                if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                    continue
                sponsor = self.clean_sponsor_name(sponsor)
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="organization"
                    if "committee" in sponsor.lower() else "person",
                    primary=False,
                    classification="cosponsor",
                )

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"

                date = event["session_date"]
                # append committee names if present
                if "committee_names" in event:
                    action = (event["status"] + " " +
                              " and ".join(event["committee_names"]))
                else:
                    action = event["status"]

                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s" %
                        (bill_id, event["action_code"], event["status"]))
                    atype = None
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions'],
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid",
                                        name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      classification="tax bill",
                      chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee",
                          "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith",
                         classification="lead sponsor",
                         entity_type="person",
                         primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note",
                          "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Пример #10
0
    def scrape(self, session=None):
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10  # seems like it gives 10 no matter what.
        start_record = 0

        params = {
            "request": {
                "sEcho": 2,
                "iColumns": 4,
                "sColumns": "",
                "iDisplayStart": 0,
                "iDisplayLength": per_page,
                "mDataProp_0": "ShortTitle",
                "mDataProp_1": "Title",
                "mDataProp_2": "LegislationCategories",
                "mDataProp_3": "Modified",
                "iSortCol_0": 0,
                "sSortDir_0": "asc",
                "iSortingCols": 0,
                "bSortable_0": "true",
                "bSortable_1": "true",
                "bSortable_2": "true",
                "bSortable_3": "true",
            },
            "criteria": {
                "Keyword": "",
                "Category": "",
                "SubCategoryId": "",
                "RequestOf": "",
                "CouncilPeriod": str(session),
                "Introducer": "",
                "CoSponsor": "",
                "CommitteeReferral": "",
                "CommitteeReferralComments": "",
                "StartDate": "",
                "EndDate": "",
                "QueryLimit": 100,
                "FilterType": "",
                "Phases": "",
                "LegislationStatus": "0",
                "IncludeDocumentSearch": "false",
            },
        }
        param_json = json.dumps(params)
        response = api_request("/GetPublicAdvancedSearch", data=param_json)
        # the response is a terrible string-of-nested-json-strings. Yuck.
        response = response["d"]
        data = response["aaData"]

        while len(data) > 0:
            for bill in data:
                # sometimes they're in there more than once, so we'll keep track
                bill_id = bill["Title"]
                if bill_id.startswith("AG"):
                    # actually an agenda, skip
                    continue
                bill_params = {"legislationId": bill_id}
                bill_info = api_request("/GetPublicData",
                                        data=json.dumps(bill_params))
                bill_info = bill_info["d"]["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]

                if bill_id.startswith("R") or bill_id.startswith("CER"):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    title=title,
                    classification=bill_type,
                )

                # sponsors and cosponsors
                if "Introducer" in legislation_info:
                    introducers = legislation_info["Introducer"]
                else:
                    # sometimes there are introducers, sometimes not.
                    # Set Introducers to empty array to avoid downstream breakage,
                    # but log bills without introducers
                    self.logger.warning("No Introducer: {0}".format(
                        bill.identifier))
                    introducers = []

                try:
                    # sometimes there are cosponsors, sometimes not.
                    cosponsors = legislation_info["CoSponsor"]
                except KeyError:
                    cosponsors = []

                for i in introducers:
                    name = i["Name"]
                    # they messed up Phil Mendelson's name
                    if name == "Phil Pmendelson":
                        name = "Phil Mendelson"
                    bill.add_sponsorship(
                        name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                for s in cosponsors:
                    name = s["Name"]
                    if name == "Phil Pmendelson":
                        name = "Phil Mendelson"
                    bill.add_sponsorship(
                        name=name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

                # if it's become law, add the law number as an alternate title
                if "LawNumber" in legislation_info:
                    law_num = legislation_info["LawNumber"]
                    if law_num:
                        bill.add_title(law_num)

                # also sometimes it's got an act number
                if "ActNumber" in legislation_info:
                    act_num = legislation_info["ActNumber"]
                    if act_num:
                        bill.add_title(act_num)

                # sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        prev_title = (add_info.lower().replace(
                            "previously", "").strip().replace(" ", ""))
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill.extras["additional_information"] = add_info

                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(
                        legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0][
                        "Name"].strip()
                    if withdrawn_by == "the Mayor":

                        bill.add_action(
                            "withdrawn",
                            withdrawn_date,
                            chamber="executive",
                            classification="withdrawal",
                        )

                    elif "committee" in withdrawn_by.lower():
                        a = bill.add_action("withdrawn",
                                            withdrawn_date,
                                            classification="withdrawal")
                        a.add_related_entity(withdrawn_by,
                                             entity_type="organization")
                    else:
                        a = bill.add_action("withdrawn",
                                            withdrawn_date,
                                            classification="withdrawal")
                        a.add_related_entity(withdrawn_by,
                                             entity_type="person")

                for action in bill_info["LegislationBillHistory"]:
                    action_name = action["Description"]
                    action_date = datetime.datetime.strptime(
                        action["ActionDate"], "%Y/%m/%d %H:%M:%S")
                    action_date = self._TZ.localize(action_date)
                    action_class = self.classify_action(action_name)

                    if "mayor" in action_name.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"

                    a = bill.add_action(
                        action_name,
                        action_date,
                        classification=action_class,
                        chamber=actor,
                    )

                    if (action_class is not None
                            and "referral-committee" in action_class):
                        if "CommitteeReferral" in legislation_info:
                            committees = []
                            for committee in legislation_info[
                                    "CommitteeReferral"]:
                                if (committee["Name"].lower() ==
                                        "retained by the council"):
                                    committees = []
                                    break
                                else:
                                    committees.append(committee["Name"])
                            if committees != []:
                                for com in committees:
                                    a.add_related_entity(
                                        com, entity_type="organization")
                        if "CommitteeReferralComments" in legislation_info:
                            for committee in legislation_info[
                                    "CommitteeReferralComments"]:
                                a.add_related_entity(
                                    committee["Name"],
                                    entity_type="organization")

                # deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor != []:
                    mayor = mayor[0]

                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(
                            mayor["TransmittedDate"])

                    # if returned but not signed, it was vetoed
                    elif "ReturnedDate" in mayor:
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action(
                            "vetoed",
                            veto_date,
                            chamber="executive",
                            classification="executive-veto",
                        )

                        # if it was returned and enacted but not signed, there was a veto override
                        if "EnactedDate" in mayor:
                            override_date = self.date_format(
                                mayor["EnactedDate"])

                            bill.add_action(
                                "veto override",
                                override_date,
                                classification="veto-override-passage",
                            )

                    if "AttachmentPath" in mayor:
                        # documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"], bill)

                congress = bill_info["CongressReview"]
                if len(congress) > 0:
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(
                            congress["TransmittedDate"])

                        bill.add_action("Transmitted to Congress for review",
                                        transmitted_date)

                # deal with committee actions
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    self.logger.warning(
                        "we can't find anything that looks like an "
                        "action date. Skipping")
                    continue
                date = self.date_format(date)

                # deal with random docs floating around
                docs = bill_info["OtherDocuments"]
                for d in docs:
                    if "AttachmentPath" in d:
                        self.add_documents(d["AttachmentPath"], bill)
                    else:
                        self.logger.warning(
                            "Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"], bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],
                                       bill)

                # full council votes
                votes = bill_info["VotingSummary"]
                for vote in votes:
                    v = self.process_vote(vote, bill, member_ids)
                    if v:
                        v.add_source(bill_source_url)
                        yield v

                # deal with committee votes
                if "CommitteeMarkup" in bill_info:
                    committee_info = bill_info["CommitteeMarkup"]
                    if len(committee_info) > 0:
                        for committee_action in committee_info:
                            v = self.process_committee_vote(
                                committee_action, bill)
                            if v:
                                v.add_source(bill_source_url)
                                yield v
                        if "AttachmentPath" in committee_info:
                            self.add_documents(vote["AttachmentPath"], bill)

                bill.add_source(bill_source_url)
                yield bill

            # get next page
            start_record += per_page
            params["request"]["iDisplayStart"] = start_record
            param_json = json.dumps(params)
            response = api_request("/GetPublicAdvancedSearch", data=param_json)
            response = response["d"]
            data = response["aaData"]
Пример #11
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = (
            "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
            "{}?calendarDate=".format(session, bill_num)
        )

        if self.is_special:
            bill_json_url = (
                "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                "{}?specialSessionValue=1&calendarDate=".format(session[0:4], bill_num)
            )

        try:
            response = self.get(bill_json_url)
            bill_json = json.loads(response.content.decode("utf-8"))
        except scrapelib.HTTPError:
            return None

        chamber = "lower" if bill_json["bill"][0] else "upper"

        bill = Bill(
            identifier=bill_json["bill"],
            legislative_session=session,
            title=bill_json["catchTitle"],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json["billTitle"])

        source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
            session, bill_json["bill"]
        )

        if self.is_special:
            source_url = "http://lso.wyoleg.gov/Legislation/{}/{}?specialSessionValue=1".format(
                session[0:4], bill_json["bill"]
            )

        bill.add_source(source_url)

        for action_json in bill_json["billActions"]:
            utc_action_date = self.parse_local_date(action_json["statusDate"])

            actor = None
            if action_json["location"] and action_json["location"] in chamber_map:
                actor = chamber_map[action_json["location"]]

            action = bill.add_action(
                chamber=actor,
                description=action_json["statusMessage"],
                date=utc_action_date,
                classification=categorize_action(action_json["statusMessage"]),
            )

            action.extras = {"billInformationID": action_json["billInformationID"]}

        if bill_json["introduced"]:
            url = "http://wyoleg.gov/{}".format(bill_json["introduced"])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["enrolledAct"]:
            url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["fiscalNote"]:
            url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["digest"]:
            url = "http://wyoleg.gov/{}".format(bill_json["digest"])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["vetoes"]:
            for veto in bill_json["vetoes"]:
                url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
                bill.add_version_link(
                    note=veto["vetoLinkText"],
                    url=url,
                    media_type="application/pdf",  # optional but useful!
                )

        for amendment in bill_json["amendments"]:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            # TODO: There are no special session amendments yet,
            # but check this url format for specials
            url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
                session[0:4], amendment["amendmentNumber"]
            )

            if amendment["sponsor"] and amendment["status"]:
                title = "Amendment {} ({}) - {} ({})".format(
                    amendment["amendmentNumber"],
                    amendment["order"],
                    amendment["sponsor"],
                    amendment["status"],
                )
            else:
                title = "Amendment {} ({})".format(
                    amendment["amendmentNumber"], amendment["order"]
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title, url=url, media_type="application/pdf"
            )
            version["extras"] = {
                "amendmentNumber": amendment["amendmentNumber"],
                "sponsor": amendment["sponsor"],
            }

        for sponsor in bill_json["sponsors"]:
            status = "primary" if sponsor["primarySponsor"] else "cosponsor"
            sponsor_type = "person" if sponsor["sponsorTitle"] else "organization"
            bill.add_sponsorship(
                name=sponsor["name"],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor["primarySponsor"],
            )

        if bill_json["summary"]:
            bill.add_abstract(note="summary", abstract=bill_json["summary"])

        if bill_json["enrolledNumber"]:
            bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

        if bill_json["chapter"]:
            bill.extras["chapter"] = bill_json["chapter"]

        if bill_json["effectiveDate"]:
            eff = datetime.datetime.strptime(bill_json["effectiveDate"], "%m/%d/%Y")
            bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

        bill.extras["wy_bill_id"] = bill_json["id"]

        for vote_json in bill_json["rollCalls"]:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill