示例#1
0
    def process_page(self):
        # annapolis_info = (
        #     XPath("//dt[text()='Annapolis Info']/following-sibling::dd[1]")
        #     .match_one(self.root)
        #     .text_content()
        # )
        # interim_info = (
        #     XPath("//dt[text()='Interim Info']/following-sibling::dd[1]")
        #     .match_one(self.root)
        #     .text_content()
        # )

        # email is formatted mailto:<addr>?body...
        email = SimilarLink("mailto:").match_one(self.root).get("href")
        email = email.split(":", 1)[1].split("?")[0]

        p = Person(
            name=CSS("h2").match_one(self.root).text.split(" ", 1)[1],
            state="md",
            image=self.image_sel.match_one(self.root).get("src"),
            party=self.extract_dd("Party"),
            district=self.extract_dd("District"),
            chamber=None,
            email=email,
        )
        p.add_link(self.source.url)
        p.add_source(self.source.url)
        return p
示例#2
0
class SenateDetail(HtmlPage):
    name_css = CSS(".field--name-title")
    image_css = CSS(".bSenBio__media-btn")
    district_css = CSS(".bDistrict h2")
    address_css = CSS(".bSenBio__address p")
    phone_css = CSS(".bSenBio__tel a")
    contact_link_sel = SimilarLink(
        r"https://oksenate.gov/contact-senator\?sid=")

    def process_page(self):
        for bio in CSS(".bSenBio__infoIt").match(self.root):
            if "Party:" in bio.text_content():
                party = bio.text_content().split(":")[1].strip()
        p = ScrapePerson(
            name=self.name_css.match_one(self.root).text,
            state="ok",
            chamber="upper",
            party=party,
            image=self.image_css.match_one(self.root).get("href"),
            district=self.district_css.match_one(
                self.root).text.strip().split()[1],
        )
        p.capitol_office.address = self.address_css.match_one(self.root).text
        p.capitol_office.phone = self.phone_css.match_one(self.root).text
        p.add_link(
            self.contact_link_sel.match_one(self.root).get("href"),
            "Contact Form")

        return p
示例#3
0
class HouseDetail(HtmlPage):
    image_selector = SimilarLink(
        "https://www.okhouse.gov/Members/Pictures/HiRes/")
    prefix = "#ctl00_ContentPlaceHolder1_lbl"
    name_css = CSS(prefix + "Name")
    district_css = CSS(prefix + "District")
    party_css = CSS(prefix + "Party")

    def process_page(self):
        name = self.name_css.match_one(self.root).text.split(maxsplit=1)[1]
        p = ScrapePerson(
            name=name,
            state="ok",
            chamber="upper",
            party=self.party_css.match_one(self.root).text,
            district=self.district_css.match_one(self.root).text.split()[1],
        )
        p.image = self.image_selector.match_one(self.root).get("href")

        contact_url = self.source.url.replace("District.aspx", "Contact.aspx")
        assert contact_url.startswith(
            "https://www.okhouse.gov/Members/Contact.aspx?District=")
        p.add_link(contact_url, note="Contact Form")

        # capitol address
        check_capitol_address = (CSS(".districtheadleft").match(
            self.root)[0].text_content().strip())
        if check_capitol_address == "Capitol Address:":
            capitol_address_div = (CSS(".districtheadleft + div").match(
                self.root)[0].text_content().strip().splitlines())
            p.capitol_office.address = "; ".join(
                [ln.strip() for ln in capitol_address_div[:-1]])
            p.capitol_office.phone = capitol_address_div[-1].strip()
        return p
示例#4
0
class House(HtmlListPage):
    selector = SimilarLink(
        r"https://www.okhouse.gov/Members/District.aspx\?District=", num_items=101
    )
    source = "https://www.okhouse.gov/Members/Default.aspx"

    def process_item(self, item):
        return HouseDetail({"name": item.text.strip()}, source=item.get("href"))
示例#5
0
class Senate(HtmlListPage):
    selector = SimilarLink("https://oksenate.gov/senators/", min_items=45, max_items=48)
    source = "https://oksenate.gov/senators"

    def process_item(self, item):
        party, _, district, name = item.text_content().split(maxsplit=3)
        return SenateDetail(
            {"party": party, "district": district, "name": name.strip()},
            source=item.get("href"),
        )
示例#6
0
def test_similar_link_selector():
    root = lxml.etree.fromstring(dummy_html)
    assert len(SimilarLink("https").match(root)) == 2