Пример #1
0
    def process_page(self):
        """Yield a LegDetail for each member in the NJ roster JSON embedded in the page."""
        raw = CSS("#__NEXT_DATA__").match_one(self.root).text_content()
        payload = json.loads(raw)
        party_names = {"D": "Democratic", "R": "Republican"}
        for member in payload["props"]["pageProps"]["legrosterData"][0]:
            first = member["First_Name"]
            middle = member["Middle_Name"]
            last = member["Last_Name"]
            suffix = member["Suffix"]
            member_id = member["BioLink"].split("/")[2]
            url = "https://www.njleg.state.nj.us" + member["BioLink"]
            party = party_names[member["Party"]]
            district = member["Roster_District"]
            chamber = "upper" if member["Roster_House"] == "Senate" else "lower"

            # assemble display name: optional middle name, optional suffix
            name = " ".join([first, middle, last] if middle else [first, last])
            if suffix:
                name += f", {suffix}"

            p = ScrapePerson(
                name=name,
                given_name=first,
                family_name=last,
                state="nj",
                chamber=chamber,
                party=party,
                district=district,
            )
            p.add_source(self.source.url)
            p.add_source(url)
            p.add_link(url)
            api_url = f"https://www.njleg.state.nj.us/api/legislatorData/legislatorBio/{member_id}"
            p.add_source(api_url)
            yield LegDetail(p, source=api_url)
Пример #2
0
    def process_page(self):
        """Scrape an Oklahoma member's district page into a ScrapePerson."""
        # heading text is "<first word> <name>"; keep everything after the first word
        heading = self.name_css.match_one(self.root).text
        person = ScrapePerson(
            name=heading.split(maxsplit=1)[1],
            state="ok",
            chamber="upper",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        person.image = self.image_selector.match_one(self.root).get("href")

        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District=")
        person.add_link(contact_url, note="Contact Form")

        # a capitol address block is only present when labelled "Capitol Address:"
        label = CSS(".districtheadleft").match(self.root)[0].text_content().strip()
        if label == "Capitol Address:":
            lines = (
                CSS(".districtheadleft + div")
                .match(self.root)[0]
                .text_content()
                .strip()
                .splitlines()
            )
            # last line is the phone number; everything before it is the address
            person.capitol_office.address = "; ".join(
                ln.strip() for ln in lines[:-1])
            person.capitol_office.phone = lines[-1].strip()
        return person
Пример #3
0
    def process_page(self):
        """Turn the MA legislator API record in ``self.data`` into a ScrapePerson."""
        member_code = self.data["MemberCode"]
        image = f"https://malegislature.gov/Legislators/Profile/170/{member_code}.jpg"
        chamber = "upper" if self.data["Branch"] == "Senate" else "lower"

        party = self.data["Party"]
        if party == "Unenrolled":
            party = "Independent"

        p = ScrapePerson(
            name=self.data["Name"],
            state="ma",
            party=party,
            district=self.data["District"],
            chamber=chamber,
            image=image,
            email=self.data["EmailAddress"],
        )

        room_num = self.data["RoomNumber"]
        if room_num:
            capitol_address = f"24 Beacon St., Room {room_num}; Boston, MA 02133"
            p.capitol_office.address = capitol_address

        # phone number and fax number (if it exists) are both from capitol office address
        phone = self.data["PhoneNumber"]
        numbers_only_phone_length = 10

        if phone:
            # there are 3 formats for phone numbers (some must be adjusted for extensions):
            # 61772228007309 is 617 722 2800x7309
            # (617) 722-1660 is (617) 722-1660
            # (617) 722-2800 x7306 is (617) 722-2800 x7306
            if (
                len(phone) > numbers_only_phone_length
                and " " not in phone
                and "x" not in phone
            ):
                phone = phone[:10] + " x" + phone[10:]

            p.capitol_office.voice = phone

        # BUG FIX: self.data is a plain dict, so a missing "FaxNumber" key raises
        # KeyError — the old `try/except SelectorError` could never catch that.
        fax = self.data.get("FaxNumber")
        if fax:
            p.capitol_office.fax = fax

        if self.data["LeadershipPosition"]:
            p.extras["leadership position"] = self.data["LeadershipPosition"]

        p.extras["member code"] = member_code

        p.add_source(self.source.url)
        p.add_source(list_url())
        p.add_link(f"https://malegislature.gov/Legislators/Profile/{member_code}")

        return p
Пример #4
0
    def process_item(self, item):
        """Build a CO ScrapePerson from one roster table row.

        The name is intentionally blank here — presumably populated by the
        detail-page scrape (LegDetail); TODO confirm.
        """
        title = CSS("td").match(item)[0].text_content().strip()
        if title == "Representative":
            chamber = "lower"
        elif title == "Senator":
            chamber = "upper"
        else:
            # BUG FIX: an unexpected title previously fell through and raised
            # an opaque NameError on `chamber`; fail with a clear message.
            raise ValueError(f"unexpected member title: {title!r}")

        district = CSS("td").match(item)[2].text_content()

        party = CSS("td").match(item)[3].text_content()
        if party == "Democrat":
            party = "Democratic"

        p = ScrapePerson(
            name="",
            state="co",
            party=party,
            chamber=chamber,
            district=district,
        )

        p.capitol_office.voice = CSS("td").match(
            item)[4].text_content().strip()
        p.email = CSS("td").match(item)[5].text_content().strip()

        detail_link = CSS("td a").match_one(item).get("href")

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
Пример #5
0
    def process_item(self, item):
        """Build an IN ScrapePerson from one legislator card."""
        full_name = CSS("h3").match_one(item).text_content()
        district_text = CSS("p.list-district").match_one(item).text_content()
        district = re.search(r"District\s(\d+)", district_text).groups()[0]
        portrait = CSS("img").match_one(item).get("src")

        person = ScrapePerson(
            name=full_name,
            state="in",
            chamber=self.chamber,
            district=district,
            party=self.party,
            image=portrait,
        )

        # cards with more than two paragraphs carry a title in the first one
        paragraphs = CSS("p").match(item)
        if len(paragraphs) > 2:
            person.extras["title"] = paragraphs[0].text_content()

        homepage = CSS("a").match_one(item).get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")
        return RedSenDetail(person, source=homepage)
Пример #6
0
    def process_item(self, item):
        """Build an NM ScrapePerson from one roster entry."""
        spans = CSS("span").match(item)
        # first span reads "<name> - <party code>"
        pieces = spans[0].text_content().strip().split(" - ")
        name = pieces[0].strip()
        raw_party = pieces[1].strip()
        # DTS is mapped to Independent; unknown codes pass through unchanged
        party = {
            "(D)": "Democratic",
            "(R)": "Republican",
            "(DTS)": "Independent",
        }.get(raw_party, raw_party)

        district_text = spans[1].text_content().strip()
        district = re.search(r"District:\s(.+)", district_text).groups()[0].strip()

        person = ScrapePerson(
            name=name,
            state="nm",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        homepage = CSS("a").match_one(item).get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        person.image = CSS("img").match_one(item).get("src")

        return LegDetail(person, source=homepage)
Пример #7
0
    def process_item(self, row):
        """Build an MN senator from one CSV row, enriched with ``self.extra_info``.

        Returns None (skips the row) when the first-name column is empty.
        """
        if not row["First Name"]:
            return
        name = "{} {}".format(row["First Name"], row["Last Name"])
        party = PARTIES[row["Party"]]
        leg = ScrapePerson(
            name=name,
            district=row["District"].lstrip("0"),
            party=party,
            state="mn",
            chamber="upper",
            image=self.extra_info[name]["image"],
        )

        if "url" in self.extra_info[name]:
            leg.add_link(self.extra_info[name]["url"])
        if "office_phone" in self.extra_info[name]:
            leg.capitol_office.voice = self.extra_info[name]["office_phone"]
        if "email" in self.extra_info[name]:
            leg.email = self.extra_info[name]["email"]

        row["Zipcode"] = row["Zipcode"].strip()
        # BUG FIX: the old condition was a bare generator expression, which is
        # always truthy — `any()` is what was intended.  Only set the capitol
        # address when Address2 names one of the known capitol-complex buildings.
        if any(a in row["Address2"] for a in
               ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]):
            address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(
                **row)
            if "Rm. Number" in row:
                address = "{0} {1}".format(row["Rm. Number"], address)
            leg.capitol_office.address = address
        leg.add_source(self.source.url)
        leg.add_source(SEN_HTML_URL)
        return leg
Пример #8
0
    def process_item(self, item):
        """Build an SC ScrapePerson from one member listing entry."""
        member_link = CSS("a.membername").match_one(item)
        # strip the "Senator"/"Representative" prefix from the link text
        name = re.search(r"(Senator|Representative)\s(.+)",
                         member_link.text_content()).groups()[1]

        # unknown party codes pass through unchanged
        raw_party = member_link.tail.strip()
        party = {"(D)": "Democratic", "(R)": "Republican"}.get(raw_party, raw_party)

        district_el = CSS("div.district a").match_one(item)
        district = re.search(r"District\s(.+)",
                             district_el.text_content().strip()).groups()[0]

        person = ScrapePerson(
            name=name,
            state="sc",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        homepage = district_el.get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        person.image = CSS("img").match_one(item).get("src")

        # detail pages respond slowly, so allow a longer timeout
        return LegDetail(person, source=URL(homepage, timeout=20))
Пример #9
0
    def process_item(self, item):
        """Build an IN senator from one cell of the senators grid."""
        links = CSS("div a").match(item)
        name = links[1].text_content()

        grid_text = (
            CSS("div .esg-content.eg-senators-grid-element-1")
            .match_one(item)
            .text_content()
        )
        # second pipe-separated field holds e.g. "District 12"
        district_label = grid_text.split("|")[1].strip().lower()
        district = re.search(r"district\s(\d+)", district_label).groups()[0]

        person = ScrapePerson(
            name=name,
            state="in",
            chamber=self.chamber,
            district=district,
            party=self.party,
            # images are lazy-loaded; the real URL lives in data-lazysrc
            image=CSS("div img").match_one(item).get("data-lazysrc"),
        )

        person.extras["city"] = (
            CSS("div .esg-content.eg-senators-grid-element-27")
            .match_one(item)
            .text_content()
        )

        homepage = links[1].get("href")
        person.add_link(homepage, note="homepage")
        person.add_source(self.source.url)
        person.add_source(homepage)
        return BlueSenDetail(person, source=homepage)
Пример #10
0
    def process_item(self, item):
        """Build an IN representative from one member card (the card is a link)."""
        district_text = CSS("div.district").match_one(item).text_content()
        district = re.search(r"House\sDistrict\s(\d+)", district_text).groups()[0]

        person = ScrapePerson(
            name=CSS("header").match_one(item).text_content(),
            state="in",
            chamber=self.chamber,
            district=district,
            party=self.party,
            image=CSS("img").match_one(item).get("src"),
        )
        person.extras["city"] = CSS("div.city").match_one(item).text_content()

        homepage = item.get("href")
        person.add_link(homepage, note="homepage")
        # the "/full" variant of the member page is used as the detail source
        full_page = homepage + "/full"
        person.add_source(full_page)
        person.add_source(self.source.url)

        return BlueRepDetail(person, source=full_page)
Пример #11
0
    def process_item(self, item):
        """Build a GA ScrapePerson from one record of the members API."""
        chamber_id = item["district"]["chamberType"]

        p = ScrapePerson(
            state="ga",
            chamber=self.chamber_types[chamber_id],
            district=str(item["district"]["number"]),
            name=item["fullName"],
            family_name=item["name"]["familyName"],
            given_name=item["name"]["first"],
            suffix=item["name"]["suffix"] or "",
            party=self.party_ids[item["party"]],
        )

        # district office contact details
        da = item["districtAddress"]
        if da["email"]:
            p.email = da["email"]
        if da["phone"]:
            p.district_office.voice = da["phone"]
        if da["fax"]:
            p.district_office.fax = da["fax"]
        if da["address1"]:
            parts = [da["address1"]]
            if da["address2"]:
                parts.append(da["address2"])
            parts.append("{city}, {state} {zip}".format(**da))
            p.district_office.address = "; ".join(parts).strip()

        # photos: expect zero or one; anything else breaks a scraper assumption
        photos = item["photos"]
        if photos:
            if len(photos) != 1:
                raise Exception("unknown photos configuration: " + str(photos))
            # strip off ?size=mpSm for full size
            p.image = photos[0]["url"].split("?")[0]

        # extras
        p.extras["residence"] = item["residence"]
        p.extras["city"] = item["city"].strip()
        p.extras["georgia_id"] = item["id"]

        list_page = (
            f"https://www.legis.ga.gov/members/{self.chamber_names[chamber_id]}/"
            f"{item['id']}?session={item['sessionId']}")
        p.add_source(list_page,
                     note="Initial list page (requires authorization token)")

        detail = URL(
            f"https://www.legis.ga.gov/api/members/detail/{item['id']}?session=1029&chamber={chamber_id}",
            headers={"Authorization": get_token()},
        )

        return LegDetail(p, source=detail)
Пример #12
0
    def process_item(self, item):
        """Build a LA ScrapePerson from one member card; skips vacant seats."""
        raw_name = CSS("h4 span").match_one(item).text_content().strip()
        if re.search(r"Vacant", raw_name):
            self.skip()
        # name appears as "Last, First"
        pieces = raw_name.split(", ")
        name = pieces[1] + " " + pieces[0]

        district = (CSS("i.fa.fa-map").match_one(item)
                    .getnext().text_content().strip())
        party = (CSS("i.fa.fa-users").match_one(item)
                 .getnext().text_content().strip())
        if party == "Democrat":
            party = "Democratic"

        person = ScrapePerson(
            name=name,
            state="la",
            party=party,
            district=district,
            chamber=self.chamber,
            email=CSS("a").match(item)[2].text_content().strip(),
            image=CSS("img").match_one(item).get("src"),
        )

        homepage = CSS("a").match(item)[1].get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        return LegislatorDetail(person, source=homepage)
Пример #13
0
    def process_page(self):
        """Yield a LegDetail for each DE legislator in the JSON listing."""
        party_names = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        for record in self.data["Data"]:
            person = ScrapePerson(
                name=record["PersonFullName"],
                state="de",
                party=party_names[record["PartyCode"]],
                chamber=self.chamber,
                district=record["DistrictNumber"],
            )

            person.add_source(self.source.url)
            detail = URL(
                f"https://legis.delaware.gov/LegislatorDetail?personId={record['PersonId']}"
            )
            person.add_source(detail.url)
            person.add_link(detail.url, note="homepage")

            yield LegDetail(person, source=detail.url)
Пример #14
0
    def process_item(self, item):
        """Build an MI senator from one roster table row."""
        member, party, district, contact_link, phone, office = item.getchildren()

        member_name = member.text_content()
        district_num = district.text_content()

        # vacant seats are listed as "Interim District ..."
        if "Interim District" in member_name:
            self.skip()

        # each of these cells contains exactly one link
        profile_url = CSS("a").match_one(member).get("href")
        contact_url = CSS("a").match_one(contact_link).get("href")
        # URL pattern observed elsewhere on senate.michigan.gov
        portrait_url = (
            f"https://senate.michigan.gov/_images/{district_num}{ord_suffix(district_num)}.jpg"
        )

        person = ScrapePerson(
            **split_name(member_name),
            state="mi",
            chamber="upper",
            district=district_num,
            party=self.PARTY_MAP[party.text],
            image=portrait_url,
        )
        person.capitol_office.voice = str(phone.text_content())
        person.capitol_office.address = str(office.text_content())
        person.add_source(self.source.url)
        person.add_link(profile_url)
        person.add_link(contact_url, note="Contact")
        return person
Пример #15
0
    def process_item(self, item):
        """Build a PA ScrapePerson from one member list entry."""
        # link text is "Last, First"
        pieces = CSS("a").match_one(item).text_content().strip().split(", ")
        name = pieces[1] + " " + pieces[0]

        raw_district = CSS("br").match(item)[-1].tail.strip()
        district = re.search(r"District\s(.+)", raw_district).groups()[0]

        # unknown party codes pass through unchanged
        raw_party = CSS("b").match_one(item).tail.strip()
        party = {
            "(D)": "Democratic",
            "(R)": "Republican",
            "(I)": "Independent",
        }.get(raw_party, raw_party)

        person = ScrapePerson(
            name=name,
            state="pa",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        homepage = CSS("a").match_one(item).get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        # member pages respond slowly; bump the request timeout
        return LegDetail(person, source=URL(homepage, timeout=10))
Пример #16
0
    def process_item(self, item):
        """Build an IL ScrapePerson from one roster table row."""
        cells = CSS("td").match(item)
        # header rows have a single cell or a "header" class on the first cell
        if len(cells) == 1 or cells[0].get("class") == "header":
            self.skip()

        first_link = CSS("td a").match(item)[0]
        homepage = first_link.get("href")

        party_names = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        person = ScrapePerson(
            name=first_link.text_content(),
            state="il",
            party=party_names[cells[4].text_content()],
            chamber=self.chamber,
            district=cells[3].text_content(),
        )

        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        return LegDetail(person, source=homepage)
Пример #17
0
    def process_item(self, item):
        """Build a KY ScrapePerson from one member tile; skips vacant seats."""
        name = CSS("h3").match_one(item).text_content()
        if name == " - Vacant Seat":
            self.skip()

        raw_party = CSS("small").match_one(item).text_content()
        party = "Democratic" if raw_party == "Democrat" else raw_party

        raw_district = CSS("p").match(item)[0].text_content()
        # district text spans a line break; strip leading zeros from the number
        district = (
            re.search(r"District:\r\n(.+)", raw_district)
            .groups()[0]
            .strip()
            .lstrip("0")
        )

        person = ScrapePerson(
            name=name,
            state="ky",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        homepage = item.get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        return LegDetail(person, source=homepage)
Пример #18
0
    def process_page(self):
        """Build a FL representative from list-page input plus office details."""
        person = ScrapePerson(
            state="fl",
            chamber="lower",
            name=fix_name(self.input.name),
            party=str(self.input.party),
            district=str(self.input.district),
            image=self.input.image,
        )

        # each office section is a <ul> following an <h3 id="...-office">
        offices = {
            "district": person.district_office,
            "capitol": person.capitol_office,
        }
        for otype, office in offices.items():
            matches = self.root.xpath(
                f"//h3[@id='{otype}-office']/following-sibling::ul")
            if not matches:
                continue
            spans = matches[0].xpath(".//span")

            # first span holds a multi-line address, second span the phone
            office.address = "; ".join(
                line.strip()
                for line in spans[0].text_content().strip().splitlines()
                if line.strip())
            office.voice = spans[1].text_content().strip()

        return person
Пример #19
0
    def process_item(self, item):
        """Build a CA Assembly member from one roster table row.

        Parses name, party, district, photo, and both capitol and district
        office contact details from the same row.
        """
        # third link holds the member name plus "Contact Assembly Member" boilerplate
        name = CSS("a").match(item)[2].text_content()
        name = re.sub(r"Contact Assembly Member", "", name).strip()

        party = CSS("td").match(item)[2].text_content().strip()
        if party == "Democrat":
            party = "Democratic"

        # strip leading zeros from the district number
        district = CSS("td").match(item)[1].text_content().strip().lstrip("0")

        # District 18 has a vacant spot
        if name == "edit":
            self.skip("skipping Vacant seat in District {}".format(district))

        photo_url = CSS("img").match(item, min_items=0)
        if photo_url:
            photo_url = photo_url[0].get("src")

        p = ScrapePerson(
            name=name,
            state="ca",
            chamber="lower",
            district=district,
            party=party,
            image=photo_url,
        )

        # NOTE(review): absolute XPath into the roster table — fragile if the
        # page layout changes; index [1] selects the second text node of the cell.
        capitol_office_header = CSS("h3").match(item)[0].text_content()
        capitol_office_text = (
            XPath(
                "//*[@id='block-views-view-members-block-1']/div/div/div/table/tbody/tr[1]/td[4]/text()"
            )
            .match(item)[1]
            .strip()
        )
        # cell text is "<address>; <phone>"
        capitol_office_text, capitol_office_phone = capitol_office_text.split("; ")
        capitol_office_address = capitol_office_header + capitol_office_text

        p.capitol_office.address = capitol_office_address
        p.capitol_office.voice = capitol_office_phone

        # each district-office paragraph is also "<address>; <phone>"
        district_offices = XPath(".//td/p[1]/text()").match(item)

        for office in district_offices:
            district_address, district_phone = office.split("; ")
            p.add_office(
                classification="district",
                address=district_address.strip(),
                voice=district_phone.strip(),
            )

        url = CSS("a").match(item)[0].get("href")
        p.add_link(url)
        p.add_source(self.source.url)

        return p
Пример #20
0
    def process_item(self, item):
        """Build an MT ScrapePerson from one roster table row."""
        email_cell, name_cell, party_cell, seat_cell, phone_cell = item.getchildren()

        # seat reads e.g. "SD 12" (senate district) or "HD 34"
        seat_code, district = seat_cell.text_content().strip().split()
        url = str(name_cell.xpath("a/@href")[0])

        person = ScrapePerson(
            name=clean_name(name_cell.text_content()),
            state="mt",
            party=party_cell.text_content().strip(),
            chamber="upper" if seat_code == "SD" else "lower",
            district=district,
        )
        person.add_link(url)
        person.add_source(url)

        phone_text = phone_cell.text_content().strip()
        # a 14-char value is a single formatted number; much longer values hold
        # several space-separated numbers — keep only the first
        if len(phone_text) == 14:
            person.capitol_office.voice = phone_text
        elif len(phone_text) > 30:
            person.capitol_office.voice = phone_text.split("    ")[0]

        mailto = email_cell.xpath("./a/@href")
        if mailto:
            person.email = mailto[0].split(":", 1)[1]

        return person
Пример #21
0
    def process_item(self, item):
        """Build a TN ScrapePerson from one roster table row.

        Returns None for vacant seats and one known-bad entry.
        """
        cells = CSS("td").match(item)
        member_cell = cells[1].text_content().strip()
        # guard clauses: vacancy, plus a hard-coded known-bad entry
        if member_cell in ("Vacant", "Martin, Greg"):
            return

        # name appears as "Last, First"; drop any "Speaker " honorific
        pieces = member_cell.split(", ")
        name = pieces[1] + " " + pieces[0]
        if "Speaker" in name:
            name = re.sub(r"Speaker ", "", name)

        raw_party = cells[2].text_content().strip()
        party = {"D": "Democratic", "R": "Republican"}.get(raw_party, raw_party)

        district = re.search(
            r"District\s(.+)", cells[4].text_content().strip()).groups()[0]

        person = ScrapePerson(
            name=name,
            state="tn",
            chamber=self.chamber,
            district=district,
            party=party,
        )

        links = CSS("td a").match(item)
        homepage = links[1].get("href")
        person.add_source(self.source.url)
        person.add_source(homepage)
        person.add_link(homepage, note="homepage")

        person.email = re.search(r"mailto:(.+)", links[0].get("href")).groups()[0]

        # this is also being grabbed above in capitol_office.address
        person.extras["office"] = cells[5].text_content().strip()

        return LegDetail(person, source=homepage)
Пример #22
0
    def process_page(self):
        """Build an OK senator from their bio page.

        Raises ValueError when the page has no "Party:" bio row.
        """
        # BUG FIX: `party` was only assigned when a "Party:" row existed, so
        # pages without one raised an opaque NameError below.
        party = None
        for bio in CSS(".bSenBio__infoIt").match(self.root):
            if "Party:" in bio.text_content():
                party = bio.text_content().split(":")[1].strip()
        if party is None:
            raise ValueError("no 'Party:' entry found on senator bio page")
        p = ScrapePerson(
            name=self.name_css.match_one(self.root).text,
            state="ok",
            chamber="upper",
            party=party,
            image=self.image_css.match_one(self.root).get("href"),
            district=self.district_css.match_one(self.root).text.strip().split()[1],
        )
        p.capitol_office.address = self.address_css.match_one(self.root).text
        p.capitol_office.voice = self.phone_css.match_one(self.root).text
        p.add_link(
            self.contact_link_sel.match_one(self.root).get("href"), "Contact Form"
        )

        return p
Пример #23
0
    def process_item(self, item):
        """Build an AZ ScrapePerson from one member table row.

        Rows without a leading link are treated as header rows and skipped.
        """
        try:
            # link text is "Name" or "Name -- Title"
            name = name_title = CSS("a").match(item)[0].text_content()
        except SelectorError:
            # NOTE(review): assumes self.skip() raises to abort the item;
            # otherwise name_title below would be unbound — verify.
            self.skip("header row")

        if "--" in name_title:
            name, title = [word.strip() for word in name.split("--")]

        # first cell is the name link handled above
        _, district, party, email, room, capitol_phone = item.getchildren()

        district = district.text_content()

        party = party.text_content()
        if party == "R":
            party = "Republican"
        elif party == "D":
            party = "Democratic"

        # cell shows "Email: <user>"; the @azleg.gov domain is appended here
        email = email.text_content()
        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        # capitol address is the chamber building plus the member's room number
        room = room.text_content()
        if self.chamber == "lower":
            address = "House of Representatives\n "
        elif self.chamber == "upper":
            address = "Senate\n "
        address = address + "1700 West Washington\n " + room + "\nPhoenix, AZ 85007"

        capitol_phone = capitol_phone.text_content()

        image = CSS("td a img").match(item)
        if image:
            image = image[0].get("src")

        p = ScrapePerson(
            name=name,
            state="az",
            chamber=self.chamber,
            district=district,
            party=party,
            email=email,
            image=image,
        )

        p.capitol_office.address = address
        p.capitol_office.voice = capitol_phone
        p.add_source(self.source.url)
        p.add_link(CSS("a").match(item)[0].get("href"))

        # keep the title parsed from "Name -- Title" entries
        if "--" in name_title:
            p.extras["title"] = title
        return p
Пример #24
0
    def process_item(self, item):
        """Build an MI representative from one listing item; skips vacancies."""
        if "Vacant" in item.text_content():
            self.skip("vacant")

        link = item.xpath(".//a")[0]
        url = link.get("href")
        # link text looks like "  Name (Party)  District-NN"
        name, party, district = re.match(
            r"\s+([^\(]+)\((\w+)\)\s+District-(\d+)", link.text).groups()

        # contact cell holds office, phone, email in order
        office_el, phone_el, email_el = item.getchildren()[1].getchildren()[0:3]

        person = ScrapePerson(
            **split_name(name),
            state="mi",
            chamber="lower",
            district=district,
            party=party,
            email=email_el.text_content().strip(),
        )
        # NOTE(review): repairs hrefs that start "http:/r" — verify intent
        if url.startswith("http:/r"):
            url = url.replace("http:/", "http://")
        person.add_link(url)
        person.add_source(self.source.url)
        person.capitol_office.voice = phone_el.text_content().strip()
        person.capitol_office.address = office_el.text_content().strip()
        return person
Пример #25
0
    def process_item(self, item):
        """Build an MI representative from one roster table row."""
        website, district, name, party, office, phone, email = item.getchildren()

        # header rows use <th> cells
        if website.tag == "th":
            self.skip()

        # expand office-name abbreviations via self.office_names
        office_text = office.text_content()
        for abbr, full in self.office_names.items():
            office_text = office_text.replace(abbr, full)

        person = ScrapePerson(
            name=name.text_content(),
            state="mi",
            chamber="lower",
            district=district.text_content().lstrip("0"),
            party=party.text_content(),
            email=email.text_content(),
        )
        link = CSS("a").match_one(website).get("href")
        # NOTE(review): repairs hrefs that start "http:/r" — verify intent
        if link.startswith("http:/r"):
            link = link.replace(":/", "://")
        person.add_link(link)
        person.add_source(self.source.url)
        person.capitol_office.voice = phone.text_content()
        person.capitol_office.address = office_text
        return person
Пример #26
0
    def process_page(self):
        """Attach contact details to an OH representative built from list-page input."""
        person = ScrapePerson(
            state="oh",
            chamber="lower",
            district=self.input.district,
            name=self.input.name,
            party=self.input.party,
            image=self.input.image,
        )
        person.add_source(self.input.url)
        person.add_link(self.input.url)

        info_bars = CSS(".member-info-bar-module").match(self.root)
        # the last module holds the contact details
        for value in CSS(".member-info-bar-value").match(info_bars[-1]):
            text = value.text_content()
            if ", OH" in text:
                # join the pieces of the div into a single address string
                kids = value.getchildren()
                person.capitol_office.address = "; ".join([
                    kids[0].text.strip(),
                    kids[0].tail.strip(),
                    kids[1].tail.strip(),
                ])
            elif "Phone:" in text:
                person.capitol_office.voice = text.split(": ")[1]
            elif "Fax:" in text:
                person.capitol_office.fax = text.split(": ")[1]

        return person
Пример #27
0
    def process_item(self, item):
        """Build an HI ScrapePerson from one legislator list row."""
        try:
            link = CSS("a").match(item)[1]
        except SelectorError:
            self.skip()

        data = {
            "last_name": link.text_content(),
            "url": link.get("href"),
        }
        # pull each labelled field by its element id suffix
        for key, label in self.LABELS.items():
            data[key] = (
                CSS(f"[id$={label}]").match_one(item).text_content().strip()
            )

        party = {"(D)": "Democratic", "(R)": "Republican"}[data["party"]]
        chamber = "upper" if data["chamber"] == "S" else "lower"

        person = ScrapePerson(
            name=data["first_name"] + " " + data["last_name"],
            state="hi",
            chamber=chamber,
            district=data["district"],
            given_name=data["first_name"],
            family_name=data["last_name"],
            party=party,
            email=data["email"],
        )
        person.capitol_office.address = "Hawaii State Capitol, Room " + data["room"]
        person.capitol_office.voice = data["voice"]
        person.capitol_office.fax = data["fax"]
        person.add_source(data["url"])
        person.add_link(data["url"])
        return person
Пример #28
0
    def process_page(self):
        """Scrape a Maryland legislator detail page into a ScrapePerson."""
        # NOTE: the page also exposes "Annapolis Info" / "Interim Info" <dd>
        # blocks; those are intentionally not scraped here.

        # The mailto href looks like "mailto:<addr>?body=..."; keep only <addr>.
        mailto = SimilarLink("mailto:").match_one(self.root).get("href")
        address = mailto.split(":", 1)[1].split("?")[0]

        # Heading is "<title> <name>"; drop the leading title word.
        full_name = CSS("h2").match_one(self.root).text.split(" ", 1)[1]

        person = ScrapePerson(
            name=full_name,
            state="md",
            image=self.image_sel.match_one(self.root).get("src"),
            party=self.extract_dd("Party"),
            district=self.extract_dd("District"),
            chamber=None,
            email=address,
        )
        person.add_link(self.source.url)
        person.add_source(self.source.url)
        return person
Пример #29
0
    def process_page(self):
        """Merge list-page input with the MO House detail page into a ScrapePerson."""
        seed = self.input
        party_names = {"D": "Democratic", "R": "Republican"}

        headshot = CSS("img#ContentPlaceHolder1_imgPhoto1").match_one(self.root)

        person = ScrapePerson(
            state="mo",
            party=party_names[seed.party],
            image=headshot.get("src"),
            chamber="lower",
            district=seed.district,
            name=f"{seed.first_name} {seed.last_name}",
            given_name=seed.first_name,
            family_name=seed.last_name,
        )
        # TODO: hometown is collected on the list page but not yet stored.
        # person.extras["hometown"] = seed.hometown
        person.capitol_office.voice = seed.voice
        person.capitol_office.address = (
            "MO House of Representatives; 201 West Capitol Avenue; "
            f"Room {seed.room}; Jefferson City MO 65101 "
        )
        person.add_link(seed.url)
        person.add_source(seed.url)
        return person
Пример #30
0
    def process_page(self):
        """Scrape a Texas House member detail page into a ScrapePerson.

        Builds the person from list-page input, then parses every office
        paragraph on the page and feeds it to ``process_address``.
        """
        p = ScrapePerson(
            name=self.input.name,
            state="tx",
            party=self.input.party,
            district=self.input.district,
            chamber="lower",
            image=self.input.image,
        )

        def office_name(element):
            """Return the office label from the nearest preceding <h4>, sans trailing colon."""
            return element.xpath("preceding-sibling::h4[1]/text()")[0].rstrip(
                ":")

        # One entry per office paragraph; "label"/"type" start out identical
        # (both taken from the <h4> heading) and "details" is the raw text.
        offices_text = [{
            "label": office_name(p_tag),
            "type": office_name(p_tag),
            "details": p_tag.text_content(),
        } for p_tag in self.root.xpath(
            '//h4/following-sibling::p[@class="double-space"]')]

        # NOTE: offices_text is deliberately extended inside this loop (see the
        # multi-office branch below); the appended sub-office entries are then
        # processed by later iterations of this same loop.
        for office_text in offices_text:
            details = office_text["details"].strip()

            # A few member pages have blank office listings:
            if details == "":
                continue

            # At the time of writing, this case of multiple district
            # offices occurs exactly once, for the representative at
            # District 4:
            if details.count("Office") > 1:
                # Split the combined text into one chunk per "<Word> Office"
                # section (non-greedy up to the next section or end of text).
                district_offices = [
                    district_office.strip() for district_office in re.findall(
                        r"(\w+ Office.+?(?=\w+ Office|$))",
                        details,
                        flags=re.DOTALL)
                ]
                # Queue each chunk as its own district-address entry.
                # NOTE(review): the trailing "(?=\w+ Office|$)?" lookahead is
                # optional and so never constrains the match — presumably the
                # ".+" alone captures everything after "Office"; confirm
                # against live District 4 page text before changing.
                offices_text += [{
                    "label":
                    re.match(r"\w+ Office", office).group(),
                    "type":
                    "District Address",
                    "details":
                    re.search(r"(?<=Office).+(?=\w+ Office|$)?", office,
                              re.DOTALL).group(),
                } for office in district_offices]

            process_address(details, p, office_text)

        return p