Пример #1
0
    def process_item(self, item):
        """Turn one committee link into a committee and queue its detail page.

        Names containing "Joint", "Task Force" or "Conference" are passed to
        ``self.skip()``. A name containing "Subcommittee" produces a
        subcommittee whose parent is read from the first preceding ``<a>``
        matched relative to ``item``; anything else is a plain committee.

        Returns a ``HouseCommitteeDetail`` whose source is a constructed
        MemberGridCluster URL.
        """
        raw_name = item.text_content()

        # only scrape joint coms on senate scrape
        if any(marker in raw_name
               for marker in ("Joint", "Task Force", "Conference")):
            self.skip()

        committee_name = remove_comm(raw_name).strip()

        if "Subcommittee" not in committee_name:
            com = ScrapeCommittee(name=committee_name, chamber=self.chamber)
        else:
            sub_name = committee_name.replace("Subcommittee on ", "")
            sub_name = sub_name.replace(", Subcommittee", "")

            preceding_links = XPath("..//..//preceding-sibling::a").match(item)
            parent_name = remove_comm(preceding_links[0].text_content())

            com = ScrapeCommittee(
                name=sub_name,
                chamber=self.chamber,
                classification="subcommittee",
                parent=parent_name,
            )

        # The link given on the page is swapped for a constructed URL that is
        # easier to scrape: everything after the site root is reused as the
        # trailing query fragment.
        query_tail = item.get("href").replace("https://www.house.mo.gov/", "")
        source = f"https://www.house.mo.gov/MemberGridCluster.aspx?filter=compage&category=committee&{query_tail}"
        return HouseCommitteeDetail(com, source=URL(source, timeout=30))
Пример #2
0
    def process_item(self, item):
        """Create a committee for items under a recognised ``<h3>`` heading.

        The heading text is looked up with one XPath and, if that selector
        fails, with a second one that starts three levels higher. Items under
        "Standing Committees" become committees, items under
        "Appropriations Subcommittees" become subcommittees of
        "Appropriations", and any other heading goes to ``self.skip()``.
        """
        try:
            headings = XPath("..//preceding-sibling::h3/text()").match(item)
        except SelectorError:
            headings = XPath("../../..//preceding-sibling::h3/text()").match(
                item)

        wanted = ("Standing Committees", "Appropriations Subcommittees")
        for heading in headings:
            if heading not in wanted:
                self.skip()
            else:
                anchor = CSS("a").match_one(item)
                label = anchor.text_content()
                detail_url = anchor.get("href")
                if heading == "Appropriations Subcommittees":
                    com = ScrapeCommittee(
                        name=label,
                        classification="subcommittee",
                        chamber=self.chamber,
                        parent="Appropriations",
                    )
                else:
                    com = ScrapeCommittee(name=label, chamber=self.chamber)
                return SenateCommitteeDetail(com, source=detail_url)
Пример #3
0
    def process_item(self, item):
        """Build a lower-chamber committee from a list entry.

        The entry's single ``<a>`` supplies both the name and the URL. One
        known member-less URL and all names starting with "Joint Committee" or
        "Joint Legislative" are passed to ``self.skip()``. Names starting with
        "Subcommittee" take their parent from the first child of the item's
        grandparent element.
        """
        anchor = CSS("a").match_one(item)
        comm_name = anchor.text_content()
        comm_url = anchor.get("href")

        # "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst" has no members
        if comm_url == "https://jtlegbudget.legislature.ca.gov/sublegislativeanalyst":
            self.skip()

        # Joint Committees are skipped to avoid duplicates (they were already
        # grabbed during SenateCommitteeList())
        if comm_name.startswith(("Joint Committee", "Joint Legislative")):
            self.skip()
        elif comm_name.startswith("Subcommittee"):
            grandparent = item.getparent().getparent()
            parent_comm = grandparent.getchildren()[0].text_content()
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber="lower",
                parent=parent_comm,
            )
        else:
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber="lower",
            )

        for url in (self.source.url, comm_url):
            com.add_source(url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))
Пример #4
0
    def process_page(self):
        """Scrape one upper-chamber committee and its members from the page.

        The committee name is the text of the second
        ``h2.committeeName`` element. Pages whose name starts with
        "Appropriations Subcommittee" are not scraped and yield ``None``.

        Returns:
            The populated ``ScrapeCommittee``, or ``None`` for Appropriations
            subcommittee pages.
        """
        name = self.root.xpath('//h2[@class="committeeName"]')[1].text
        if name.startswith("Appropriations Subcommittee"):
            # TODO: restore scraping of Appropriations Subcommittees (strip the
            # "Appropriations " prefix and attach "Appropriations" as parent).
            return None

        if name.startswith("Committee on"):
            name = name.replace("Committee on ", "")

        comm = ScrapeCommittee(
            name=name,
            classification="committee",
            chamber="upper",
            parent=None,
        )

        # Members with a role: each <dt> holds the role, and the first
        # following <dd> sibling holds the member's name.
        for dt in self.root.xpath('//div[@id="members"]/dl/dt'):
            role = dt.text.replace(": ", "").strip().lower()
            member = dt.xpath("./following-sibling::dd")[0].text_content()
            comm.add_member(self.clean_name(member), role=role)

        # Members listed as plain <li> entries are added without a role.
        for li in self.root.xpath('//div[@id="members"]/ul/li'):
            comm.add_member(self.clean_name(li.text_content()))

        comm.add_source(self.source.url)

        return comm
Пример #5
0
    def process_item(self, item):
        """Build a committee from a list entry and queue its members page.

        The first container link whose href contains "members" becomes the
        source of the returned ``HouseCommitteeDetail``. Every other container
        link is attached to the committee; the one whose href contains "home"
        is attached with the note "homepage". When no such link is attached,
        ``self.warn`` is called.
        """
        container_links = ".//div[contains(@class, 'container')]//a"

        link = (XPath(f"{container_links}[contains(@href, 'members')]").match(
            item)[0].get("href"))
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)

        # Look the homepage link up once, and allow it to be absent so the
        # warning below can fire. More than one match is still an error, as it
        # was with match_one().
        home_links = XPath(f"{container_links}[contains(@href, 'home')]").match(
            item, min_items=0, max_items=1)
        home_link = home_links[0] if home_links else None

        # Must be initialised: the loop may never reach the homepage branch.
        homepage = False
        for anchor in XPath(container_links).match(item):
            url = anchor.get("href")
            if url == link:
                # the members page is the detail source, not an extra link
                continue
            if home_link is not None and anchor is home_link:
                com.add_link(url, note="homepage")
                homepage = True
            else:
                com.add_link(url)
        if not homepage:
            self.warn("no homepage found")

        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)
Пример #6
0
    def process_item(self, item):
        """Build an upper-chamber committee from a list entry.

        The section heading is taken from the nearest preceding sibling of the
        item's parent that has no child elements. Entries under
        "Joint Committees" or "Task Forces" go to ``self.skip()``; entries
        under "Senate Committees" are scraped.

        Raises:
            ValueError: if no heading is found or the heading is not one of
                the three known sections.
        """
        anchor = CSS("a").match(item)[0]
        comm_name = anchor.text_content().strip()

        chamber_type = None
        for sib in item.getparent().itersiblings(preceding=True):
            if len(sib.getchildren()) == 0:
                chamber_type = sib.text_content().strip()
                break

        if chamber_type in ("Joint Committees", "Task Forces"):
            self.skip()
        if chamber_type != "Senate Committees":
            # Fail with a clear message instead of an UnboundLocalError when
            # the page layout changes or a new section appears.
            raise ValueError(
                f"unexpected committee section heading: {chamber_type!r}")
        chamber = "upper"

        com = ScrapeCommittee(
            name=comm_name,
            classification="committee",
            chamber=chamber,
        )

        detail_link = anchor.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return SenDetail(com, source=detail_link)
Пример #7
0
 def process_item(self, item):
     """Wrap a committee link in a ``CommitteeDetail`` page.

     The committee name is the link's text content and the detail source is
     its ``href``.
     """
     committee = ScrapeCommittee(name=item.text_content(),
                                 chamber=self.chamber)
     detail_url = item.get("href")
     return CommitteeDetail(committee, source=detail_url)
Пример #8
0
    def process_item(self, item):
        """Create a committee from a tile element and queue its detail page.

        The name comes from the first ``div span.bTiles__title`` inside the
        item; the detail source is the item's own ``href``.
        """
        title_spans = CSS("div span.bTiles__title").match(item)
        committee = ScrapeCommittee(
            name=title_spans[0].text_content(),
            classification="committee",
            chamber=self.chamber,
        )
        return SenateCommitteeDetail(committee, source=item.get("href"))
Пример #9
0
    def process_item(self, item):
        """Build a committee or Finance subcommittee from a list entry.

        The name is the text before the first " (", title-cased. The element
        just before the item's parent gives the classification heading, and
        the element before that gives the chamber heading. Names containing
        "Conference" are passed to ``self.skip()``.
        """
        label = item.text_content().strip()
        comm_name = label.split(" (")[0].title().replace("(Fin Sub)", "")

        if "Conference" in comm_name:
            self.skip()

        class_heading = item.getparent().getprevious()
        chamber_text = class_heading.getprevious().text_content().strip()
        # Unrecognised headings are passed through unchanged.
        chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint Committee": "legislature",
        }.get(chamber_text, chamber_text)

        classification = item.getparent().getprevious().text_content().strip()

        if classification != "Finance Subcommittee":
            com = ScrapeCommittee(
                name=comm_name,
                classification="committee",
                chamber=chamber,
            )
        else:
            # work around duplicate name of Judiciary committees,
            # a current limitation in how Open States can handle committees
            # see https://github.com/openstates/issues/issues/598
            if comm_name == "Judiciary":
                comm_name = "Judiciary (Finance)"
            com = ScrapeCommittee(
                name=comm_name,
                classification="subcommittee",
                chamber=chamber,
                parent="Finance",
            )

        detail_link = CSS("a").match_one(item).get("href")

        for url in (self.source.url, detail_link):
            com.add_source(url)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
Пример #10
0
class CommitteeDetail(HtmlPage):
    """Detail page that adds chairs and members to the committee in ``self.input``."""

    example_source = "https://www.legis.state.pa.us/cfdocs/CteeInfo/index.cfm?Code=32&CteeBody=H&SessYear=2021"
    example_name = "Aging & Older Adult Services"
    example_input = ScrapeCommittee(name=example_name,
                                    classification="committee",
                                    chamber="lower")

    def process_page(self):
        """Add up to two chairs, then the majority and minority members.

        Returns:
            The committee from ``self.input`` with members added.
        """
        com = self.input

        # The chair section holds the main chair and, optionally, a second
        # (democratic / minority) chair, each as a name link plus a role div.
        chair_css = "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText"
        try:
            chair_links = CSS(f"{chair_css} a").match(self.root)
            chair_roles = CSS(f"{chair_css} div").match(self.root)
        except SelectorError:
            # A page without a chair section is tolerated: the selector
            # reports "nothing matched" with SelectorError, not IndexError.
            chair_links, chair_roles = [], []
        # zip() only yields a chair when both its name and its role are present.
        for link, role in zip(chair_links[:2], chair_roles[:2]):
            com.add_member(fix_name(link.text.strip()), role.text.strip())

        self._add_members(
            com,
            ".Widget.CteeInfo-MajorityList .MemberInfoList-MemberWrapper.Member",
        )
        self._add_members(
            com,
            ".Widget.CteeInfo-MinorityList .MemberInfoList-MemberWrapper.Member",
        )
        return com

    def _add_members(self, com, wrapper_selector):
        """Add every member wrapper matched by ``wrapper_selector`` to ``com``."""
        for wrapper in CSS(wrapper_selector).match(self.root):
            # The name is required. It is looked up outside the try block so a
            # wrapper without a name link fails loudly instead of silently
            # re-adding the previous member's name.
            member_name = CSS("div a").match_one(wrapper).text.strip()
            try:
                position = CSS(".position").match_one(wrapper).text.strip()
            except SelectorError:
                # no explicit position element: plain member
                position = "member"
            com.add_member(fix_name(member_name), position)
Пример #11
0
    def process_item(self, item):
        """Build a committee or Finance subcommittee from a list entry.

        The name is the text before the first " (", title-cased. The chamber
        heading is read two elements before the item's parent and the
        classification heading one element before it. Names containing
        "Conference" are passed to ``self.skip()``.
        """
        raw_text = item.text_content().strip()
        comm_name = raw_text.split(" (")[0].title().replace("(Fin Sub)", "")

        if "Conference" in comm_name:
            self.skip()

        heading = item.getparent().getprevious().getprevious()
        chamber = heading.text_content().strip()
        # Translate known headings; anything else is left as found.
        for heading_text, code in (
            ("House", "lower"),
            ("Senate", "upper"),
            ("Joint Committee", "legislature"),
        ):
            if chamber == heading_text:
                chamber = code
                break

        classification = item.getparent().getprevious().text_content().strip()

        committee_kwargs = {
            "name": comm_name,
            "classification": "committee",
            "chamber": chamber,
        }
        if classification == "Finance Subcommittee":
            committee_kwargs["classification"] = "subcommittee"
            committee_kwargs["parent"] = "Finance"
        com = ScrapeCommittee(**committee_kwargs)

        detail_link = CSS("a").match_one(item).get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommiteeDetail(com, source=URL(detail_link, timeout=30))
Пример #12
0
    def process_item(self, item):
        """Build a committee or subcommittee from a link's text.

        Four known non-committee link texts are passed to ``self.skip()``.
        Names containing "Joint" use the "legislature" chamber, all others
        "upper". For names containing "Committee" the boilerplate prefixes and
        suffix are removed; names that also contain "Subcommittee" are split on
        " – " into parent and subcommittee name.
        """
        name = item.text_content()
        chamber = "legislature" if "Joint" in name else "upper"

        non_committee_links = (
            "2021 Senate Committee Hearing Schedule",
            "Assigned Bills",
            "Committee Minutes",
            "Appointees To Be Considered",
        )
        if name in non_committee_links:
            return self.skip()

        if "Committee" not in name:
            com = ScrapeCommittee(name=name, chamber=chamber)
        else:
            comm_name = name
            # Order matters: the longer prefixes must be removed first.
            for boilerplate in (
                "Joint Committee on the ",
                "Joint Committee on ",
                "Committee on ",
                " Committee",
            ):
                comm_name = comm_name.replace(boilerplate, "")

            if "Subcommittee" not in name:
                com = ScrapeCommittee(name=comm_name, chamber=chamber)
            else:
                pieces = comm_name.split(" – ")
                parent = pieces[0]
                comm_name = pieces[1].replace("Subcommittee", "")

                com = ScrapeCommittee(
                    name=comm_name,
                    chamber=chamber,
                    classification="subcommittee",
                    parent=parent,
                )

        return SenateCommitteeDetail(
            com, source=URL(item.get("href"), timeout=30))
Пример #13
0
    def process_item(self, item):
        """Create a "legislature" committee from a link and queue its detail page.

        The committee name is the link's stripped, title-cased text; the link's
        ``href`` is recorded as a source and as the homepage.
        """
        title = item.text_content().strip().title()
        com = ScrapeCommittee(
            name=title,
            classification="committee",
            chamber="legislature",
        )

        detail_link = item.get("href")

        for url in (self.source.url, detail_link):
            com.add_source(url)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)
Пример #14
0
 def process_item(self, item):
     """Create a committee from a link and queue its detail page.

     The link's ``href`` is recorded both as a source and as the homepage.
     """
     com = ScrapeCommittee(
         name=item.text_content().strip(),
         classification="committee",
         chamber=self.chamber,
     )
     homepage = item.get("href")
     com.add_source(homepage)
     com.add_link(homepage, "homepage")
     return CommitteeDetail(com, source=homepage)
Пример #15
0
    def process_item(self, item):
        """Create a committee from one API record and queue its detail request.

        ``item`` is indexed by the keys "chamber", "id" and "name". Chamber
        code 2 maps to "upper" and code 1 to "lower".

        Raises:
            ValueError: if the chamber code is neither 1 nor 2.
        """
        chamber_code = item["chamber"]
        if chamber_code == 2:
            chamber = "upper"
        elif chamber_code == 1:
            chamber = "lower"
        else:
            # Name the unknown code instead of failing later with an
            # UnboundLocalError on `chamber`.
            raise ValueError(f"unrecognized chamber code: {chamber_code!r}")

        source = URL(
            f"https://www.legis.ga.gov/api/committees/details/{item['id']}/1029",
            headers={"Authorization": get_token()},
        )

        com = ScrapeCommittee(
            name=item["name"],
            chamber=chamber,
        )

        com.add_source(
            self.source.url, note="Initial list page (requires authorization token)"
        )

        return CommitteeDetail(
            com,
            source=source,
        )
Пример #16
0
 def process_item(self, item):
     """Create a committee from the first ``<a>`` in ``item``.

     The anchor's text is the name; its ``href`` is recorded as a source, as
     the homepage link, and as the detail page source.
     """
     anchor = CSS("a").match(item)[0]
     com = ScrapeCommittee(
         name=anchor.text_content(),
         classification="committee",
         chamber=self.chamber,
     )
     homepage = anchor.get("href")
     com.add_source(homepage)
     com.add_link(homepage, note="homepage")
     return CommitteeDetail(com, source=homepage)
Пример #17
0
class HouseCommitteeDetail(HtmlPage):
    """Detail page that adds "Rep." members to the committee in ``self.input``."""

    example_source = "https://www.house.mi.gov/Committee/AGRI"
    example_input = ScrapeCommittee(name="Agriculture", chamber="lower")

    def process_page(self):
        """Record the page as source and homepage, then add each listed member.

        Only links whose text starts with "Rep." are treated as members. The
        member's title is the text of the element following the link, and
        falls back to "member" when that text is empty or the element is
        missing.

        Returns:
            The committee from ``self.input`` with members added.
        """
        com = self.input
        com.add_source(self.source.url)
        com.add_link(self.source.url, note="homepage")

        for link in CSS(".mb40 li a").match(self.root):
            # link.text is None when the anchor starts with a child element.
            text = link.text or ""
            if not text.startswith("Rep."):
                continue

            title_el = link.getnext()
            title = title_el.text_content().strip() if title_el is not None else ""

            # Drop everything from the first "(" and the "Rep. " prefix, then
            # strip the whitespace that was left in front of the "(".
            name = text.split("(")[0].replace("Rep. ", "").strip()
            com.add_member(name, title or "member")

        return com
Пример #18
0
    def process_item(self, item):
        """Create a committee from the item's single ``<a>``.

        Returns a ``DetailCommitteePage`` for the link, except for one known
        URL for which the bare committee is returned directly.
        """
        anchor = CSS("a").match_one(item)
        com = ScrapeCommittee(
            name=anchor.text_content(),
            chamber=self.chamber,
        )

        detail_link = anchor.get("href")

        for url in (self.source.url, detail_link):
            com.add_source(url)
        com.add_link(detail_link, note="homepage")

        broken_page = "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/"
        if detail_link != broken_page:
            return DetailCommitteePage(com, source=detail_link)

        # this link has broken html (not able to grab member info)
        # just returning name, chamber, and link
        return com
Пример #19
0
    def process_item(self, item):
        """Create a committee from a link, skipping joint committees.

        Raises:
            SkipItem: when ``self.chamber`` is "legislature".
        """
        com = ScrapeCommittee(
            name=item.text_content().strip(),
            classification="committee",
            chamber=self.chamber,
        )

        detail_link = item.get("href")

        com.add_source(self.source.url)

        # detail links for Joint Committees are hidden
        # "javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvJICommittees','cmdCommittee$0')"
        if self.chamber == "legislature":
            raise SkipItem("joint committee")

        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return CommDetail(com, source=detail_link)
Пример #20
0
    def process_item(self, item):
        """Create a committee from a link and queue its members page.

        The chamber comes from the first word of the element preceding the
        link's great-grandparent. The members page URL is built from the
        third-from-last and last path segments of the link's ``href``.
        """
        name = item.text_content().strip()

        section = item.getparent().getparent().getparent().getprevious()
        first_word = section.text_content().strip().split()[0]

        if first_word == "Legislative":
            # skipping Legislative Agencies
            self.skip()

        # Unrecognised headings are passed through unchanged.
        chamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "legislature",
        }.get(first_word, first_word)

        com = ScrapeCommittee(
            name=name,
            chamber=chamber,
        )

        com.add_source(self.source.url)

        # new source
        href = item.get("href")
        segments = href.split("/")
        agency, committee = segments[-3], segments[-1]
        new_source = f"https://app.leg.wa.gov/ContentParts/CommitteeMembers/?agency={agency}&committee={committee}"

        com.add_source(new_source)
        com.add_link(href, note="homepage")

        return CommitteeDetail(com, source=new_source)
Пример #21
0
    def process_item(self, item):
        """Create a committee, or a subcommittee when the name contains " - ".

        Subcommittee names are split on " - Subcommittee on " into the parent
        name and the subcommittee name.
        """
        name = item.text_content().strip()
        if " - " not in name:
            com = ScrapeCommittee(name=name, chamber=self.chamber)
        else:
            parent, com_name = name.split(" - Subcommittee on ")
            com = ScrapeCommittee(
                name=com_name,
                classification="subcommittee",
                parent=parent,
                chamber=self.chamber,
            )

        com.add_source(self.source.url)
        detail_url = item.get("href")
        return SenateCommitteeDetail(com, source=detail_url)
Пример #22
0
    def process_item(self, item):
        """Create a "legislature" subcommittee from a link.

        The parent name is the text of the first child of the link's
        fourth-level ancestor. Items whose title-cased parent is
        "Alc-Jbc Budget Hearings" are passed to ``self.skip()``.
        """
        sub_name = item.text_content().strip()

        ancestor = item
        for _ in range(4):
            ancestor = ancestor.getparent()
        parent_title = ancestor.getchildren()[0].text_content().strip().title()

        if parent_title == "Alc-Jbc Budget Hearings":
            self.skip()

        com = ScrapeCommittee(
            name=sub_name.title(),
            classification="subcommittee",
            chamber="legislature",
            parent=parent_title,
        )

        detail_link = item.get("href")

        for url in (self.source.url, detail_link):
            com.add_source(url)
        com.add_link(detail_link, note="homepage")

        return HouseJointDetail(com, source=detail_link)
Пример #23
0
    def process_page(self):
        """Yield a ``HouseComDetail`` for every element matched by ``self.selector``.

        Elements whose class contains "parentcommittee" are created with no
        parent and chamber "lower". Every element after one, until the next
        such element, is created with that committee's ``_id`` as parent and
        chamber ``None``.
        """
        # don't use list page because we need to look back at prior element
        parent, chamber = None, "lower"

        for item in self.selector.match(self.root):
            is_parent = "parentcommittee" in item.attrib.get("class", "")
            name = item.text_content().strip()

            if is_parent:
                parent, chamber = None, "lower"

            comm = ScrapeCommittee(
                name=name,
                classification="committee",
                chamber=chamber,
                parent=parent,
            )
            yield HouseComDetail(comm, source=item.attrib["href"])

            # parent for next time
            if is_parent:
                parent, chamber = comm._id, None
Пример #24
0
    def process_item(self, item):
        """Create a committee, or a subcommittee when the name contains " - ".

        Subcommittee names are split on " - " into parent and name; the
        abbreviated parent "Approps." is expanded to "Appropriations".
        """
        name = item.text_content()
        if " - " not in name:
            committee = ScrapeCommittee(name=name, chamber=self.chamber)
        else:
            parent, name = name.split(" - ")

            # there is one subcommittee that has a shortened parent called "Approps."
            parent = "Appropriations" if parent == "Approps." else parent
            committee = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                parent=parent,
                chamber=self.chamber,
            )

        committee.add_source(self.source.url)
        detail_url = item.get("href")
        return CommitteeDetail(committee, source=detail_url)
Пример #25
0
 def process_item(self, item):
     """Create a committee from a link and queue its ``SenateCommitteeDetail`` page."""
     com = ScrapeCommittee(name=item.text_content().strip(),
                           chamber=self.chamber)
     com.add_source(self.source.url)
     detail_url = item.get("href")
     return SenateCommitteeDetail(com, source=detail_url)
Пример #26
0
    def process_item(self, item):
        """Build a committee, with its members, from one API record.

        ``item`` is indexed by "CommitteeName", "LegislativeBody",
        "IsSubCommittee", "Members", "CommitteeId", "CommitteeShortName" and
        "TypeName". Subcommittee names are split on " Subcommittee on " into
        parent and name. When that split fails, a warning is logged and the
        record is stored as a plain committee instead of crashing on an
        undefined parent.

        Returns:
            The populated ``ScrapeCommittee``.
        """
        name = item["CommitteeName"]
        body = item["LegislativeBody"]

        if body == "H":
            chamber = "lower"
        elif body == "S":
            chamber = "upper"
        else:
            # a few Ad Hoc Committees don't have chambers, but are not included
            # in the Standing Committees Scrape anyway
            self.logger.warning("Committee not assigned to chamber")
            chamber = "lower"

        parent = None
        if item["IsSubCommittee"] is not False:
            try:
                parent, name = name.split(" Subcommittee on ")
            except ValueError:
                # Separator missing or repeated: keep the full name.
                self.logger.warning(f"No parent listed for {name}")
                parent = None

        if parent is None:
            com = ScrapeCommittee(name=name, chamber=chamber)
        else:
            com = ScrapeCommittee(
                name=name,
                classification="subcommittee",
                chamber=chamber,
                parent=parent,
            )

        # As of now, the API lists all members twice, so each (name, position)
        # pair is only added the first time it is seen.
        seen = set()
        for member in item["Members"]:
            member_name = member["FirstName"] + " " + member["LastName"]
            if member["IsChair"]:
                position = "Chair"
            elif member["IsViceChair"]:
                position = "Vice Chair"
            else:
                position = "member"

            key = (member_name, position)
            if key in seen:
                continue
            seen.add(key)
            com.add_member(member_name, position)

        com.extras["Committee ID"] = item["CommitteeId"]
        com.extras["Committee Short Name"] = item["CommitteeShortName"]
        com.extras["Committee Type"] = item["TypeName"]

        com.add_source(self.source.url)

        return com
Пример #27
0
 def process_item(self, item):
     """Create a committee from a link and queue its ``HouseCommitteeDetail`` page."""
     com = ScrapeCommittee(name=item.text_content().strip(),
                           chamber=self.chamber)
     detail_url = item.get("href")
     return HouseCommitteeDetail(com, source=detail_url)
Пример #28
0
 def process_item(self, item):
     """Create a committee from a link and queue its detail page with a 30s timeout."""
     com = ScrapeCommittee(name=item.text_content(), chamber=self.chamber)
     com.add_source(self.source.url)
     detail_source = URL(item.get("href"), timeout=30)
     return HouseCommitteeDetail(com, source=detail_source)
Пример #29
0
    def process_item(self, item):
        """Build a committee from a link's text node and ``href``.

        Two known non-committee link texts are passed to ``self.skip()``.
        Names starting with "Joint" use the "legislature" chamber. Names
        starting with "Subcommittee" become upper-chamber subcommittees whose
        parent is the text of the first child of the link's fourth-level
        ancestor. Everything else is an upper-chamber committee.
        """
        comm_name = XPath("text()").match_one(item)
        if comm_name in ("Teleconference How-To Information",
                         "Legislative Process"):
            self.skip()

        comm_url = XPath("@href").match_one(item)

        committee_kwargs = {
            "name": comm_name,
            "classification": "committee",
            "chamber": "upper",
        }
        if comm_name.startswith("Joint"):
            committee_kwargs["chamber"] = "legislature"
        elif comm_name.startswith("Subcommittee"):
            ancestor = item.getparent().getparent().getparent().getparent()
            committee_kwargs["classification"] = "subcommittee"
            committee_kwargs["parent"] = ancestor.getchildren()[0].text_content()
        com = ScrapeCommittee(**committee_kwargs)

        for url in (self.source.url, comm_url):
            com.add_source(url)
        com.add_link(comm_url, note="homepage")
        return ChooseType(com, source=URL(comm_url))
Пример #30
0
    def process_item(self, item):
        """Create a committee from a table row and queue its detail page.

        The name is the first ``<strong>`` in the row; the row named
        "Committees" is the header and is passed to ``self.skip()``. The first
        ``<p>`` is parsed for "Secretary:", "Email:" and "Phone:" values, which
        are stored in ``com.extras``. If that paragraph does not match, a
        warning is logged and the extras are left out.
        """
        name = CSS("strong").match(item)[0].text_content()

        # skip header row
        if name == "Committees":
            self.skip()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        all_text = CSS("p").match(item)[0].text_content().strip()
        contact = re.search(
            r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", all_text
        )
        if contact is None:
            # Contact details are supplementary; a changed layout should not
            # abort the committee scrape with an AttributeError on None.
            self.logger.warning(
                f"could not parse secretary contact details for {name}")
        else:
            secretary, email, phone = contact.groups()
            com.extras["secretary"] = secretary.strip()
            com.extras["email"] = email.strip()
            com.extras["phone"] = phone.strip()

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return DetailCommitteePage(com, source=detail_link)