Example No. 1
class HouseDetail(HtmlPage):
    image_selector = SimilarLink(
        "https://www.okhouse.gov/Members/Pictures/HiRes/")
    prefix = "#ctl00_ContentPlaceHolder1_lbl"
    name_css = CSS(prefix + "Name")
    district_css = CSS(prefix + "District")
    party_css = CSS(prefix + "Party")

    def process_page(self):
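        # Oklahoma House member detail page: pull name/district/party from the
        # labeled spans, the hi-res photo link, a derived contact-form URL, and
        # (when present) the Capitol address block.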
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = Person(
            name=name,
            state="ok",
            chamber="lower",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District=")
        p.add_link(contact_url, note="Contact Form")

        # capitol address
        check_capitol_address = CSS(".districtheadleft").match(
            self.root)[0].text_content().strip()
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (CSS(".districtheadleft + div").match(
                self.root)[0].text_content().strip().splitlines())
            p.capitol_office.address = "; ".join(
                [ln.strip() for ln in capitol_address_div[:-1]])
            p.capitol_office.phone = capitol_address_div[-1].strip()
        return p
Example No. 2
    def process_item(self, item):
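        # Senator grid card (state="in"): name, district number from the
        # "| District N" text, lazy-loaded photo, and home city, then hand off
        # to the BlueSenDetail page for the rest.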
        name = CSS("div a").match(item)[1].text_content()
        district = (
            CSS("div .esg-content.eg-senators-grid-element-1").match_one(
                item).text_content().split("|")[1].strip().lower())
        district = re.search(r"district\s(\d+)", district).groups()[0]
        img = CSS("div img").match_one(item).get("data-lazysrc")

        p = ScrapePerson(
            name=name,
            state="in",
            chamber=self.chamber,
            district=district,
            party=self.party,
            image=img,
        )

        city = (CSS("div .esg-content.eg-senators-grid-element-27").match_one(
            item).text_content())
        p.extras["city"] = city

        detail_link = CSS("div a").match(item)[1].get("href")
        p.add_link(detail_link, note="homepage")
        p.add_source(self.source.url)
        p.add_source(detail_link)
        return BlueSenDetail(p, source=detail_link)
Example No. 3
    def process_item(self, item):
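        # House member card (state="in"): name from the header, district from
        # the "House District N" text, photo, and city, then follow the
        # "/full" detail page.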
        name = CSS("header").match_one(item).text_content()
        district = CSS("div.district").match_one(item).text_content()
        district = re.search(r"House\sDistrict\s(\d+)", district).groups()[0]

        img = CSS("img").match_one(item).get("src")

        p = ScrapePerson(
            name=name,
            state="in",
            chamber=self.chamber,
            district=district,
            party=self.party,
            image=img,
        )

        p.extras["city"] = CSS("div.city").match_one(item).text_content()

        detail_link = item.get("href")
        p.add_link(detail_link, note="homepage")
        detail_link_full = detail_link + "/full"
        p.add_source(detail_link_full)

        p.add_source(self.source.url)

        return BlueRepDetail(p, source=detail_link_full)
Example No. 4
    def process_item(self, item):
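        # Legislator row (state="nm"): split "Name - (Party)" and map the
        # party abbreviation ("(DTS)" is recorded as Independent), read the
        # district from the "District: ..." span, and follow the detail link.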
        name_party = CSS("span").match(item)[0].text_content().strip().split(
            " - ")
        name = name_party[0].strip()
        party = name_party[1].strip()
        if party == "(D)":
            party = "Democratic"
        elif party == "(R)":
            party = "Republican"
        elif party == "(DTS)":
            party = "Independent"

        district = CSS("span").match(item)[1].text_content().strip()
        district = re.search(r"District:\s(.+)", district).groups()[0].strip()

        p = ScrapePerson(
            name=name,
            state="nm",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        detail_link = CSS("a").match_one(item).get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        img = CSS("img").match_one(item).get("src")
        p.image = img

        return LegDetail(p, source=detail_link)
Example No. 5
    def process_item(self, item):
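        # Committee row: the name is in the first <strong>; secretary, email,
        # and phone share a single <p> and are pulled apart with one regex.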
        name = CSS("strong").match(item)[0].text_content()

        # skip header row
        if name == "Committees":
            self.skip()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        all_text = CSS("p").match(item)[0].text_content().strip()
        secretary, email, phone = re.search(
            r"\n?Secretary:(.+)\n?Email:(.+)\n?Phone:(.+)", all_text
        ).groups()
        com.extras["secretary"] = secretary.strip()
        com.extras["email"] = email.strip()
        com.extras["phone"] = phone.strip()

        detail_link = CSS("a").match(item)[0].get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        return DetailCommitteePage(com, source=detail_link)
Example No. 6
    def process_page(self):
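        # Committee membership page: skip committees with an empty membership
        # list or only "Public Member" entries; strip the Representative/
        # Senator prefix from each remaining member's name.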
        com = self.input

        # no members
        if (
            CSS("div.Membership fieldset").match_one(self.root).text_content().strip()
            == ""
        ):
            raise SkipItem("empty committee")

        members = CSS("fieldset div.area-holder ul.list li span.col01").match(self.root)

        num_members = 0
        for member in members:
            role = member.getnext().text_content().strip()
            # skip Public Members
            if role == "Public Member":
                continue

            if role == "Member":
                role = "member"

            num_members += 1
            mem_name = CSS("span span").match_one(member).text_content().strip()
            mem_name = re.search(r"(Representative|Senator)\s(.+)", mem_name).groups()[
                1
            ]

            com.add_member(mem_name, role)

        if not num_members:
            raise SkipItem("only public members")

        return com
Example No. 7
    def process_page(self):
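        # Committee roster page: the chair block is optional, so a missing
        # selector is ignored; remaining members come from the senators
        # container, with blank positions defaulting to "member".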
        com = self.input
        com.add_source(self.source.url)
        com.add_link(self.source.url, note="homepage")

        # a few committees don't have chair positions
        try:
            chair_role = (CSS(".c-chair-block--position").match_one(
                self.root).text_content().lower())
            chair_name = CSS(".c-chair--title").match_one(
                self.root).text_content()
            com.add_member(chair_name, chair_role)

        except SelectorError:
            pass
        try:
            for p in XPath(
                    "//div[contains(@class, 'c-senators-container')]//div[@class='view-content']/div[contains(@class, 'odd') or contains(@class, 'even')]"
            ).match(self.root):
                name = CSS(".nys-senator--name").match_one(p).text_content()

                role = CSS(".nys-senator--position").match_one(
                    p).text_content().lower()
                if role == "":
                    role = "member"

                com.add_member(name, role)
        except SelectorError:
            pass

        return com
Example No. 8
    def process_item(self, item):
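        # Legislator table row (state="az"): columns are district, party,
        # email, room, and capitol phone; the capitol address is assembled
        # from the chamber name and room number.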
        try:
            name = name_title = CSS("a").match(item)[0].text_content()
        except SelectorError:
            self.skip("header row")

        if "--" in name_title:
            name, title = [word.strip() for word in name.split("--")]

        _, district, party, email, room, capitol_phone = item.getchildren()

        district = district.text_content()

        party = party.text_content()
        if party == "R":
            party = "Republican"
        elif party == "D":
            party = "Democratic"

        email = email.text_content()
        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        room = room.text_content()
        if self.chamber == "lower":
            address = "House of Representatives\n "
        elif self.chamber == "upper":
            address = "Senate\n "
        address = address + "1700 West Washington\n " + room + "\nPhoenix, AZ 85007"

        capitol_phone = capitol_phone.text_content()

        image = CSS("td a img").match(item, min_items=0)
        image = image[0].get("src") if image else ""

        p = ScrapePerson(
            name=name,
            state="az",
            chamber=self.chamber,
            district=district,
            party=party,
            email=email,
            image=image,
        )

        p.capitol_office.address = address
        p.capitol_office.voice = capitol_phone
        p.add_source(self.source.url)
        p.add_link(CSS("a").match(item)[0].get("href"))

        if "--" in name_title:
            p.extras["title"] = title
        return p
Example No. 9
    def process_item(self, item):
        com_link = CSS("a").match(item)[0]
        name = com_link.text_content()
        com = ScrapeCommittee(name=name,
                              classification="committee",
                              chamber=self.chamber)
        detail_link = com_link.get("href")
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")
        return CommitteeDetail(com, source=detail_link)
Example No. 10
    def process_page(self):
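        # Legislator detail page (ND addresses): photo, optional bio bullet
        # list, district office address, labeled phone numbers, email, and fax.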
        p = self.input

        img = CSS("div.field-person-photo img").match_one(self.root).get("src")
        p.image = img

        bio_info = CSS("div.pane-content ul li").match(self.root, min_items=0)
        if len(bio_info) > 0:
            p.extras["bio info"] = []
            for info in bio_info:
                p.extras["bio info"].append(info.text_content().strip())

        try:
            street = (CSS("div.street-address").match_one(
                self.root).text_content().strip())
            town = CSS("span.locality").match_one(
                self.root).text_content().strip()
            zip_code = (CSS("span.postal-code").match_one(
                self.root).text_content().strip())
            address = street + ", " + town + ", ND " + zip_code
            p.district_office.address = address
        except SelectorError:
            pass

        try:
            phones = XPath(
                "//*[@id='block-system-main']//div[contains(text(), 'phone')]"
            ).match(self.root)
            for phone in phones:
                phone_type = phone.text_content().strip()
                phone_number = phone.getnext().text_content().strip()
                if phone_type == "Cellphone:":
                    p.extras["Cell phone"] = phone_number
                elif phone_type == "Home Telephone:":
                    p.extras["Home phone"] = phone_number
                elif phone_type == "Office Telephone:":
                    p.district_office.voice = phone_number
        except SelectorError:
            pass

        email = (XPath(
            "//*[@id='block-system-main']//div[contains(text(), 'Email')]").
                 match_one(self.root).getnext().text_content().strip())
        p.email = email

        try:
            fax = (XPath(
                "//*[@id='block-system-main']//div[contains(text(), 'Fax')]").
                   match_one(self.root).getnext().text_content().strip())
            p.district_office.fax = fax
        except SelectorError:
            pass

        return p
Example No. 11
    def process_page(self):
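        # Legislator detail page (legislature.maine.gov): photo, an extra
        # mailing address (District-22 uses a different layout), a capitol
        # phone labeled with either <strong> or <b>, and the website link.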
        p = self.input

        img = CSS("div#content p img").match_one(self.root).get("src")
        p.image = img

        if self.source.url == "https://legislature.maine.gov/District-22":
            addr = CSS("div#content p strong").match(self.root)[2].tail.strip()
        else:
            addr = (
                CSS("div#content p strong")
                .match(self.root)[1]
                .tail.strip()
                .lstrip(":")
                .strip()
            )
        if addr != p.district_office.address:
            p.extras["Additional address"] = addr

        try:
            state_phone = (
                XPath("//*[@id='content']/p/strong[contains(text(), 'State')]")
                .match_one(self.root)
                .tail.strip()
            )
            state_phone = state_phone.lstrip(":").strip()
            p.capitol_office.voice = state_phone
        except SelectorError:
            pass

        try:
            state_phone = (
                XPath("//*[@id='content']/p/b[contains(text(), 'State')]")
                .match_one(self.root)
                .tail.strip()
            )
            state_phone = state_phone.lstrip(":").strip()
            p.capitol_office.voice = state_phone
        except SelectorError:
            pass

        website = (
            XPath("//*[@id='content']/p/strong[contains(text(), 'Website')]")
            .match_one(self.root)
            .getnext()
        )
        if website.get("href") is None:
            website = website.getnext().get("href")
        else:
            website = website.get("href")
        p.add_link(website, note="website")

        return p
Example No. 12
    def process_page(self):
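        # Committee detail page: chairs/co-chairs (role taken from the
        # preceding header), an optional member list, a temporary or permanent
        # address, and an optional press-release link.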
        com = self.input
        com.add_source(self.source.url)
        com.add_link(self.source.url, note="homepage")

        try:
            chairs = CSS(".chair-info").match(self.root)
        except SelectorError:
            raise SkipItem("skipping committee without full information")

        # in case there are co-chairs
        num_chairs = len(chairs)

        for chair in chairs:
            chair_name = CSS(".comm-chair-name").match_one(chair).text_content().strip()
            chair_role = (
                XPath(f"..//preceding-sibling::header[{num_chairs}]")
                .match_one(chair)
                .text_content()
                .strip()
                .lower()
            )
            com.add_member(chair_name, chair_role)

        # some committees only have chairs and no members list
        try:
            for p in CSS("#comm-membership ul li").match(self.root):
                name = p.text_content().strip()
                role = "member"
                com.add_member(name, role)
        except SelectorError:
            pass

        # some committees have temporary addresses, others have permanent ones
        try:
            temp, room, zip = XPath(
                "//section[@id='comm-addr']/div[@class='mod-inner']//text()"
            ).match(self.root)
            com.extras["address"] = f"{temp}: {room}; {zip}"
        except ValueError:
            room, zip = XPath(
                "//section[@id='comm-addr']/div[@class='mod-inner']//text()"
            ).match(self.root)
            com.extras["address"] = f"{room}; {zip}"

        # some committees have press releases
        try:
            news_link = CSS("#page-content .read-more").match(self.root)[0].get("href")
            com.add_link(news_link)
        except SelectorError:
            pass

        return com
Example No. 13
    def process_page(self):
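        # Bill overview page: resolve "*" carryover markers via CARRYOVERS,
        # build the Bill with short/long titles, sponsors, and actions, record
        # a BDR extra when present, then yield the bill-text tab page.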
        chamber = "upper" if self.input.identifier.startswith("S") else "lower"
        short_title = self.get_column_div("Summary").text
        long_title = CSS("#title").match_one(self.root).text

        if "*" in self.input.identifier:
            stars = re.search(r"\*+", self.input.identifier).group()
            if (
                self.input.session in CARRYOVERS
                and stars in CARRYOVERS[self.input.session]
            ):
                self.input.identifier = re.sub(
                    r"\*+",
                    "-" + CARRYOVERS[self.input.session][stars],
                    self.input.identifier,
                )
            else:
                self.logger.error(
                    f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
                )
                return

        bill = Bill(
            identifier=self.input.identifier,
            legislative_session=self.input.session,
            title=short_title,
            chamber=chamber,
        )
        bill.subject = self.input.subjects
        # use the pretty source URL
        bill.add_source(self.input.source_url)
        bill.add_title(long_title)

        try:
            sponsors = self.get_column_div("Primary Sponsor")
            self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
        except SelectorError:
            pass
        try:
            cosponsors = self.get_column_div("Co-Sponsor")
            self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
        except SelectorError:
            pass
        # TODO: figure out cosponsor div name, can't find any as of Feb 2021
        self.add_actions(bill, chamber)

        bdr = extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        text_url = self.source.url.replace("Overview", "Text")
        yield BillTabText(bill, source=text_url)
Example No. 14
    def process_item(self, item):
        # Convert names to title case as they are in all-caps
        name = CSS("span.name").match_one(item).text_content().strip()
        name = re.sub(r"^Hon\.", "", name, flags=re.IGNORECASE).strip().title()

        party = CSS("span.partido").match_one(item).text_content().strip()
        # Translate to English, since "Independent" is a generic label rather than a party name
        if party == "Independiente":
            party = "Independent"

        detail_link = CSS("a").match_one(item).get("href")
        partial = PartialSen(name=name, party=party, source=self.source.url)
        return SenDetail(partial, source=detail_link)
Example No. 15
    def process_page(self):
        com = self.input
        # The ChairWrapper section lists the committee chairs (the majority
        # chair and, when present, the democratic/minority chair) and their roles.
        try:
            # main chair
            chair_member = (CSS(
                "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText a"
            ).match(self.root)[0].text.strip())
            # main chair role
            chair_member_role = (CSS(
                "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText div"
            ).match(self.root)[0].text.strip())
            com.add_member(fix_name(chair_member), chair_member_role)
        except IndexError:
            pass
        try:
            # democratic and/or minority chair member
            demo_chair_member = (CSS(
                "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText a"
            ).match(self.root)[1].text.strip())
            # democratic and/or minority chair member role
            demo_chair_member_role = (CSS(
                "div.MemberInfoList-MemberWrapper.ChairWrapper div.ChairNameText div"
            ).match(self.root)[1].text.strip())
            com.add_member(fix_name(demo_chair_member), demo_chair_member_role)
        except IndexError:
            pass
        majority_members = CSS(
            ".Widget.CteeInfo-MajorityList .MemberInfoList-MemberWrapper.Member"
        ).match(self.root)
        for mem in majority_members:
            major_member_name = CSS("div a").match_one(mem).text.strip()
            try:
                major_mem_position = CSS(".position").match_one(mem).text.strip()
            except SelectorError:
                major_mem_position = "member"
            com.add_member(fix_name(major_member_name), major_mem_position)
        minority_members = CSS(
            ".Widget.CteeInfo-MinorityList .MemberInfoList-MemberWrapper.Member"
        ).match(self.root)
        for mem in minority_members:
            minor_member_name = CSS("div a").match_one(mem).text.strip()
            try:
                minor_mem_position = CSS(".position").match_one(mem).text.strip()
            except SelectorError:
                minor_mem_position = "member"
            com.add_member(fix_name(minor_member_name), minor_mem_position)
        return com
Example No. 16
    def process_item(self, item):
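        # Member table row (state="il"): skip header rows, read the name and
        # detail link from the first link, and the district and party letter
        # from the remaining cells.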
        # skip header rows
        if (
            len(CSS("td").match(item)) == 1
            or CSS("td").match(item)[0].get("class") == "header"
        ):
            self.skip()

        first_link = CSS("td a").match(item)[0]
        name = first_link.text_content()
        detail_link = first_link.get("href")

        district = CSS("td").match(item)[3].text_content()
        party_letter = CSS("td").match(item)[4].text_content()
        party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        party = party_dict[party_letter]

        p = ScrapePerson(
            name=name,
            state="il",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
Example No. 17
    def process_item(self, item):
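        # Legislator card (state="la"): skip vacant seats, rebuild the name
        # from "Last, First", and read district, party, email, and photo.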
        name_dirty = CSS("h4 span").match_one(item).text_content().strip()
        if re.search(r"Vacant", name_dirty):
            self.skip()
        name_dirty = name_dirty.split(", ")
        last_name = name_dirty[0]
        first_name = name_dirty[1]
        name = first_name + " " + last_name

        district = CSS("i.fa.fa-map").match_one(
            item).getnext().text_content().strip()
        party = CSS("i.fa.fa-users").match_one(
            item).getnext().text_content().strip()
        if party == "Democrat":
            party = "Democratic"
        email = CSS("a").match(item)[2].text_content().strip()
        img = CSS("img").match_one(item).get("src")

        p = ScrapePerson(
            name=name,
            state="la",
            party=party,
            district=district,
            chamber=self.chamber,
            email=email,
            image=img,
        )

        detail_link = CSS("a").match(item)[1].get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegislatorDetail(p, source=detail_link)
Example No. 18
class LegPage(HtmlPage):
    name_css = CSS("h1.mt-0")
    district_css = CSS(".col-9 h2")
    image_css = CSS("img#sen-image")
    address_css = CSS("address")

    def process_page(self):
        name = self.name_css.match_one(self.root).text.replace("Sen. ",
                                                               "").strip()
        district = self.district_css.match_one(self.root).text.split()[1]
        image = self.image_css.match_one(self.root).get("src")
        addrlines = self.address_css.match_one(self.root).text_content()

        # example:
        # Room 11th Floor
        # P.O. Box 94604
        # Lincoln, NE 68509
        # (402) 471-2733
        # Email: [email protected]
        mode = "address"
        address = []
        phone = None
        email = None
        for line in addrlines.splitlines():
            line = line.strip()
            if not line:
                continue
            if line.startswith("(402)"):
                phone = line
                mode = None
            elif line.startswith("Email:"):
                email = line.replace("Email: ", "")
            elif mode == "address":
                address.append(line)

        p = Person(
            chamber="legislature",
            party="Nonpartisan",
            state="ne",
            district=district,
            image=image,
            name=name,
            email=email,
        )
        p.capitol_office.address = "; ".join(address)
        p.capitol_office.voice = phone
        p.add_source(self.source.url)
        p.add_link(self.source.url)
        return p
Example No. 19
    def process_item(self, item):
        name = CSS(".mediaCaptionTitle").match_one(item).text
        subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text
        image = CSS(".photo").match_one(item).get("style")
        image = background_image_re.findall(image)[0]
        # e.g. District 25 | D
        district, party = subtitle.split(" | ")
        district = district.split()[1]
        party = {"D": "Democratic", "R": "Republican"}[party]

        return HouseDetail(
            HousePartial(
                name=name, district=district, party=party, url=item.get("href"), image=image,
            )
        )
Example No. 20
class SenateCommitteeList(HtmlListPage):
    source = "https://committees.senate.michigan.gov/"
    selector = CSS("form .col-md-6 ul li")
    chamber = "upper"

    def process_item(self, item):
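        # Each committee link sits under an <h3> heading; only items under
        # "Standing Committees" or "Appropriations Subcommittees" are kept,
        # with subcommittees parented to Appropriations.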
        try:
            title = XPath("..//preceding-sibling::h3/text()").match(item)

        except SelectorError:
            title = XPath("../../..//preceding-sibling::h3/text()").match(item)

        for comm_name in title:
            if (comm_name == "Standing Committees"
                    or comm_name == "Appropriations Subcommittees"):
                name_link = CSS("a").match_one(item)
                name = name_link.text_content()
                source = name_link.get("href")
                if comm_name == "Standing Committees":
                    com = ScrapeCommittee(name=name, chamber=self.chamber)
                else:
                    com = ScrapeCommittee(
                        name=name,
                        classification="subcommittee",
                        chamber=self.chamber,
                        parent="Appropriations",
                    )
                return SenateCommitteeDetail(com, source=source)
            else:
                self.skip()
Example No. 21
class LegList(HtmlListPage):
    selector = CSS("a.Legislator-Card.col-md-4.col-sm-6.col-xs-12")

    def process_item(self, item):
        name = CSS("h3").match_one(item).text_content()
        if name == " - Vacant Seat":
            self.skip()

        party = CSS("small").match_one(item).text_content()
        if party == "Democrat":
            party = "Democratic"

        district = CSS("p").match(item)[0].text_content()
        district = (
            re.search(r"District:\r\n(.+)", district).groups()[0].strip().lstrip("0")
        )

        p = ScrapePerson(
            name=name,
            state="ky",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        detail_link = item.get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
Example No. 22
    def process_item(self, item):
        link = (XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        ).match(item)[0].get("href"))
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)
        # initialize so the homepage check below can't hit an unbound variable
        homepage = False

        for links in XPath(".//div[contains(@class, 'container')]//a").match(
                item):
            url = links.get("href")
            if url == link:
                continue
            else:
                if links == XPath(
                        ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
                ).match_one(item):
                    com.add_link(url, note="homepage")
                    homepage = True
                else:
                    com.add_link(url)
        if not homepage:
            self.warn("no homepage found")

        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)
Example No. 23
class HouseCommitteeList(HtmlListPage):
    selector = CSS(".mb-3 .card-body")
    source = "https://www.house.leg.state.mn.us/committees"
    chamber = "lower"

    def process_item(self, item):
        link = (XPath(
            ".//div[contains(@class, 'container')]//a[contains(@href, 'members')]"
        ).match(item)[0].get("href"))
        name = CSS("h2 a").match(item)[0].text_content()
        com = ScrapeCommittee(name=name, chamber=self.chamber)
        # initialize so the homepage check below can't hit an unbound variable
        homepage = False

        for links in XPath(".//div[contains(@class, 'container')]//a").match(
                item):
            url = links.get("href")
            if url == link:
                continue
            else:
                if links == XPath(
                        ".//div[contains(@class, 'container')]//a[contains(@href, 'home')]"
                ).match_one(item):
                    com.add_link(url, note="homepage")
                    homepage = True
                else:
                    com.add_link(url)
        if not homepage:
            self.warn("no homepage found")

        com.add_source(self.source.url)
        return HouseCommitteeDetail(com, source=link)
Example No. 24
    def process_page(self):
        # annapolis_info = (
        #     XPath("//dt[text()='Annapolis Info']/following-sibling::dd[1]")
        #     .match_one(self.root)
        #     .text_content()
        # )
        # interim_info = (
        #     XPath("//dt[text()='Interim Info']/following-sibling::dd[1]")
        #     .match_one(self.root)
        #     .text_content()
        # )

        # email is formatted mailto:<addr>?body...
        email = SimilarLink("mailto:").match_one(self.root).get("href")
        email = email.split(":", 1)[1].split("?")[0]

        p = Person(
            name=CSS("h2").match_one(self.root).text.split(" ", 1)[1],
            state="md",
            image=self.image_sel.match_one(self.root).get("src"),
            party=self.extract_dd("Party"),
            district=self.extract_dd("District"),
            chamber=None,
            email=email,
        )
        p.add_link(self.source.url)
        p.add_source(self.source.url)
        return p
Example No. 25
    def process_page(self):
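        # The NJ roster is embedded as JSON in the Next.js __NEXT_DATA__
        # script tag; each entry yields a ScrapePerson plus a detail page
        # backed by the legislatorBio API endpoint.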
        data_elem = CSS("#__NEXT_DATA__").match_one(self.root).text_content()
        data = json.loads(data_elem)
        for item in data["props"]["pageProps"]["legrosterData"][0]:
            first = item["First_Name"]
            middle = item["Middle_Name"]
            last = item["Last_Name"]
            suffix = item["Suffix"]
            member_id = item["BioLink"].split("/")[2]
            url = "https://www.njleg.state.nj.us" + item["BioLink"]
            party = {"D": "Democratic", "R": "Republican"}[item["Party"]]
            district = item["Roster_District"]
            chamber = "upper" if item["Roster_House"] == "Senate" else "lower"
            if middle:
                name = f"{first} {middle} {last}"
            else:
                name = f"{first} {last}"
            if suffix:
                name += f", {suffix}"

            p = ScrapePerson(
                name=name,
                given_name=first,
                family_name=last,
                state="nj",
                chamber=chamber,
                party=party,
                district=district,
            )
            p.add_source(self.source.url)
            p.add_source(url)
            p.add_link(url)
            api_url = f"https://www.njleg.state.nj.us/api/legislatorData/legislatorBio/{member_id}"
            p.add_source(api_url)
            yield LegDetail(p, source=api_url)
Example No. 26
class House(HtmlListPage):
    source = URL(
        "http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/Biografia.aspx"
    )
    selector = CSS("ul.list-article li", num_items=49)

    def process_item(self, item):
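        # House member list item: strip the "Hon." prefix, map
        # "Representante por Acumulación" to an At-Large district, and follow
        # the biography link.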
        bio_info = (CSS("div.biodiv a").match_one(
            item).text_content().strip().split("\n"))
        name = bio_info[0].strip()
        name = re.sub(r"^Hon\.", "", name, flags=re.IGNORECASE).strip()

        district = bio_info[2].strip()
        if district == "Representante por Acumulación":
            district = "At-Large"
        else:
            district = re.search(r"Representante\sdel\sDistrito\s(.+)",
                                 district).groups()[0]

        partial = PartialRep(name=name,
                             district=district,
                             source=self.source.url)

        detail_link = CSS("a").match_one(item).get("href")

        return RepDetail(partial, source=detail_link)
Example No. 27
class Representatives(HtmlListPage):
    # note: there is a CSV, but it requires a bunch of ASP.net hoops to actually get
    source = URL(
        "https://house.mo.gov/MemberGridCluster.aspx?year=2021&code=R+&filter=clear"
    )
    selector = CSS("tr")

    def process_item(self, item):
        tds = CSS("td").match(item, min_items=0, max_items=8)
        if not tds:
            self.skip()
        _, last, first, district, party, town, phone, room = tds
        if last.text_content() == "Vacant":
            self.skip()
        return HouseDetail(
            HousePartial(
                last_name=last.text_content(),
                first_name=first.text_content(),
                district=int(district.text_content()),
                party=party.text_content(),
                hometown=town.text_content().strip(),
                voice=phone.text_content(),
                room=room.text_content(),
                url=CSS("a").match_one(last).get("href"),
            )
        )
Example No. 28
class LegList(HtmlListPage):
    selector = CSS("div.MemberInfoList-MemberWrapper")

    def process_item(self, item):
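        # Legislator list item (state="pa"): rebuild the name from
        # "Last, First", read the district from the text after the last <br>,
        # and map the party abbreviation.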
        name_dirty = CSS("a").match_one(item).text_content().strip().split(
            ", ")
        name = name_dirty[1] + " " + name_dirty[0]

        district = CSS("br").match(item)[-1].tail.strip()
        district = re.search(r"District\s(.+)", district).groups()[0]

        party = CSS("b").match_one(item).tail.strip()
        if party == "(D)":
            party = "Democratic"
        elif party == "(R)":
            party = "Republican"
        elif party == "(I)":
            party = "Independent"

        p = ScrapePerson(
            name=name,
            state="pa",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        detail_link = CSS("a").match_one(item).get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=URL(detail_link, timeout=10))
Example No. 29
class Legislators(HtmlListPage):
    selector = CSS("div.member")

    def process_item(self, item):
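        # Member card (state="sc"): strip the Senator/Representative prefix,
        # read the party abbreviation from the link's tail text, and the
        # district number from its link.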
        name = CSS("a.membername").match_one(item).text_content()
        name = re.search(r"(Senator|Representative)\s(.+)", name).groups()[1]

        party = CSS("a.membername").match_one(item).tail.strip()
        if party == "(D)":
            party = "Democratic"
        elif party == "(R)":
            party = "Republican"

        district = CSS("div.district a").match_one(item).text_content().strip()
        district = re.search(r"District\s(.+)", district).groups()[0]

        p = ScrapePerson(
            name=name,
            state="sc",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        detail_link = CSS("div.district a").match_one(item).get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        img = CSS("img").match_one(item).get("src")
        p.image = img

        return LegDetail(p, source=URL(detail_link, timeout=20))
Example No. 30
    def process_page(self):
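        # Legislator detail page: the capitol address is stitched together
        # from the three lines after the "Contact Information" heading, and
        # the phone number is pulled from the "Phone:" label.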
        p = self.input

        img = CSS("img.rounded").match_one(self.root).get("src")
        p.image = img

        contact_info = XPath("//strong[contains(text(), 'Contact Information')]").match(
            self.root
        )[0]
        cap_addr = contact_info.getnext().tail.strip()
        cap_addr += " "
        cap_addr += contact_info.getnext().getnext().tail.strip()
        cap_addr += " "
        cap_addr += contact_info.getnext().getnext().getnext().tail.strip()
        p.capitol_office.address = cap_addr

        try:
            phone = (
                XPath("//strong[contains(text(), 'Phone:')]")
                .match(self.root)[0]
                .tail.strip()
            )
            phone = re.search(r"(\d{3}-\d{3}-\d{4})(.+)?", phone).groups()[0]
            p.capitol_office.voice = phone
        except SelectorError:
            pass

        return p