Пример #1
0
    def process_item(self, item):
        # strip leading zero
        district = str(int(item.get("id")))
        image = CSS(".mem-pic a img").match_one(item).get("src")
        name = CSS(".mem-name a").match_one(item)

        district_addr, capitol_addr = self.process_addresses(item)

        # email, twitter, facebook are all sometimes present
        try:
            email = CSS(".mem-email a").match_one(item).text.strip()
        except SelectorError:
            email = ""
        try:
            twitter = CSS(".fa-twitter").match_one(item)
            twitter = twitter.getparent().get("href").split("/")[-1]
        except SelectorError:
            twitter = ""
        try:
            facebook = CSS(".fa-facebook").match_one(item)
            facebook = facebook.getparent().get("href").split("/")[-1]
        except SelectorError:
            facebook = ""

        party = self.party_mapping[district][1]

        p = ScrapePerson(
            state="ny",
            chamber="lower",
            image=image,
            party=party,
            district=district,
            name=name.text.strip(),
            email=email,
        )
        p.add_link(url=name.get("href"))
        p.add_source(url=name.get("href"))
        if twitter:
            p.ids.twitter = twitter
        if facebook:
            p.ids.facebook = facebook
        p.district_office.address = district_addr["address"]
        p.district_office.voice = district_addr["phone"] or ""
        p.district_office.fax = district_addr["fax"] or ""
        p.capitol_office.address = capitol_addr["address"]
        p.capitol_office.voice = capitol_addr["phone"] or ""
        p.capitol_office.fax = capitol_addr["fax"] or ""
        return p
Пример #2
0
    def process_item(self, item):
        try:
            title = XPath("..//preceding-sibling::h3/text()").match(item)

        except SelectorError:
            title = XPath("../../..//preceding-sibling::h3/text()").match(item)

        for comm_name in title:
            if (comm_name == "Standing Committees"
                    or comm_name == "Appropriations Subcommittees"):
                name_link = CSS("a").match_one(item)
                name = name_link.text_content()
                source = name_link.get("href")
                if comm_name == "Standing Committees":
                    com = ScrapeCommittee(name=name, chamber=self.chamber)
                else:
                    com = ScrapeCommittee(
                        name=name,
                        classification="subcommittee",
                        chamber=self.chamber,
                        parent="Appropriations",
                    )
                return SenateCommitteeDetail(com, source=source)
            else:
                self.skip()
Пример #3
0
    def process_item(self, item):
        try:
            link = CSS("a").match(item)[1]
        except SelectorError:
            self.skip()
        data = {
            "last_name": link.text_content(),
            "url": link.get("href"),
        }
        for key, label in self.LABELS.items():
            data[key] = CSS(f"[id$={label}]").match_one(
                item).text_content().strip()

        party = {"(D)": "Democratic", "(R)": "Republican"}[data["party"]]
        address = "Hawaii State Capitol, Room " + data["room"]
        chamber = "upper" if data["chamber"] == "S" else "lower"

        p = ScrapePerson(
            name=data["first_name"] + " " + data["last_name"],
            state="hi",
            chamber=chamber,
            district=data["district"],
            given_name=data["first_name"],
            family_name=data["last_name"],
            party=party,
            email=data["email"],
        )
        p.capitol_office.address = address
        p.capitol_office.voice = data["voice"]
        p.capitol_office.fax = data["fax"]
        p.add_source(data["url"])
        p.add_link(data["url"])
        return p
Пример #4
0
    def process_item(self, item):
        # skip header rows
        if (
            len(CSS("td").match(item)) == 1
            or CSS("td").match(item)[0].get("class") == "header"
        ):
            self.skip()

        first_link = CSS("td a").match(item)[0]
        name = first_link.text_content()
        detail_link = first_link.get("href")

        district = CSS("td").match(item)[3].text_content()
        party_letter = CSS("td").match(item)[4].text_content()
        party_dict = {"D": "Democratic", "R": "Republican", "I": "Independent"}
        party = party_dict[party_letter]

        p = ScrapePerson(
            name=name,
            state="il",
            party=party,
            chamber=self.chamber,
            district=district,
        )

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegDetail(p, source=detail_link)
Пример #5
0
 def process_item(self, item):
     com_link = CSS("a").match(item)[0]
     name = com_link.text_content()
     com = ScrapeCommittee(name=name,
                           classification="committee",
                           chamber=self.chamber)
     detail_link = com_link.get("href")
     com.add_source(detail_link)
     com.add_link(detail_link, note="homepage")
     return CommitteeDetail(com, source=detail_link)
Пример #6
0
    def process_page(self):
        p = self.input

        district = CSS("div.hidden-xs.mem-info h3").match_one(self.root).text_content()
        title, district = re.search(r"(.+)\s\|\sDistrict\s(\d+)", district).groups()
        p.district = district
        if title != "Representative":
            p.extras["title"] = title

        assistant = CSS("div.hidden-xs.mem-info a").match(self.root)[0]
        assistant_name = assistant.text_content()
        assistant_email = assistant.get("href")
        assistant_email = re.search(r"mailto:(.+)", assistant_email).groups()[0]
        assistant_phones = (
            CSS("div.hidden-xs.mem-info p.no-margin").match(self.root)[1].text_content()
        )
        phone1, phone2 = re.search(r"Phone:\s(.+)\s\|\s(.+)", assistant_phones).groups()

        p.extras["assistant name"] = assistant_name
        p.extras["assistant email"] = assistant_email
        p.extras["assistant phone1"] = phone1
        p.extras["assistant phone2"] = phone2

        press_name = (
            CSS("div.hidden-xs.mem-info div.small-block.last p")
            .match(self.root)[0]
            .text_content()
        )
        press_phone = (
            CSS("div.hidden-xs.mem-info div.small-block.last p")
            .match(self.root)[1]
            .text_content()
        )
        press_phone = re.search(r"Phone:\s(.+)", press_phone).groups()[0]
        press_email = (
            CSS("div.hidden-xs.mem-info div.small-block.last a")
            .match_one(self.root)
            .text_content()
        )

        p.extras["press contact name"] = press_name
        p.extras["press contact phone"] = press_phone
        p.extras["press contact email"] = press_email

        return p
Пример #7
0
    def process_item(self, item):
        com_link = CSS("a").match_one(item)
        name = com_link.text_content()

        com = ScrapeCommittee(
            name=name,
            chamber=self.chamber,
        )

        detail_link = com_link.get("href")

        com.add_source(self.source.url)
        com.add_source(detail_link)
        com.add_link(detail_link, note="homepage")

        # this link has broken html (not able to grab member info)
        # just returning name, chamber, and link
        if detail_link == "https://legislature.idaho.gov/sessioninfo/2021/joint/cec/":
            return com

        return DetailCommitteePage(com, source=detail_link)