Example #1
    def process_item(self, item):
        """Build a ScrapePerson from one member card and queue its detail page."""
        raw_name = CSS("h4 span").match_one(item).text_content().strip()
        # Vacant seats carry no member data; skip the card entirely.
        if re.search(r"Vacant", raw_name):
            self.skip()

        # Cards list names as "Last, First" -- reorder to "First Last".
        name_parts = raw_name.split(", ")
        name = f"{name_parts[1]} {name_parts[0]}"

        # District and party text live in the element right after each icon.
        district = (
            CSS("i.fa.fa-map").match_one(item).getnext().text_content().strip()
        )
        party = (
            CSS("i.fa.fa-users").match_one(item).getnext().text_content().strip()
        )
        party = "Democratic" if party == "Democrat" else party

        links = CSS("a").match(item)
        email = links[2].text_content().strip()
        detail_link = links[1].get("href")
        img = CSS("img").match_one(item).get("src")

        p = ScrapePerson(
            name=name,
            state="la",
            party=party,
            district=district,
            chamber=self.chamber,
            email=email,
            image=img,
        )

        p.add_source(self.source.url)
        p.add_source(detail_link)
        p.add_link(detail_link, note="homepage")

        return LegislatorDetail(p, source=detail_link)
Example #2
    def process_item(self, item):
        """Parse one house-member tile into a HousePartial and follow its page."""
        name = CSS(".mediaCaptionTitle").match_one(item).text
        subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

        # The portrait URL is embedded in the tile's inline background-image style.
        style_attr = CSS(".photo").match_one(item).get("style")
        image = background_image_re.findall(style_attr)[0]

        # Subtitle looks like "District 25 | D".
        district_text, party_code = subtitle.split(" | ")
        district = district_text.split()[1]
        party = {"D": "Democratic", "R": "Republican"}[party_code]

        partial = HousePartial(
            name=name,
            district=district,
            party=party,
            url=item.get("href"),
            image=image,
        )
        return HouseDetail(partial)
Example #3
    def process_page(self):
        """Fill in email, address, phones, and staff contacts for the senator
        from their profile page.

        The contact paragraphs vary per member: some pages merge the
        legislative assistant and media contact into one paragraph, some put
        the media contact in the third or fourth paragraph, and two members
        have bespoke layouts handled via hard-coded URLs.
        """
        p = self.input

        # The first contact link is the "mailto:" anchor.
        email = CSS("div.sen-contact a").match(self.root)[0].get("href")
        p.email = re.search(r"mailto:(.+)", email).groups()[0]

        # This exact query was previously repeated seven times; run it once.
        contact_ps = CSS("div.sen-contact p").match(self.root)

        addr = contact_ps[0].text_content()
        # This member's page lists no phone numbers at all.
        if self.source.url == "https://www.indianasenaterepublicans.com/young":
            phone1 = None
            phone2 = None
        else:
            addr, phone1, phone2 = re.search(
                r"(.+)Phone:\s(\d{3}-\d{3}-\d{4})\s?or\s(\d{3}-\d{3}-\d{4})", addr
            ).groups()

        p.capitol_office.address = addr
        if phone1:
            p.capitol_office.voice = phone1
        if phone2:
            p.extras["second phone"] = phone2

        # The legislative assistant blurb is the second paragraph, unless the
        # page collapses everything into a single paragraph.
        if len(contact_ps) == 1:
            leg_assist = contact_ps[0].text_content()
        else:
            leg_assist = contact_ps[1].text_content()

        assist_re = r"Legislative\sAssistant:?(.+)Phone:\s(.+)Email:\s(.+)"
        if len(contact_ps) < 3:
            # Assistant and media contact share one paragraph; split them apart.
            pieces = leg_assist.split("Media Contact:")
            leg_assist = pieces[0]
            media_contact = pieces[1]
            leg_assist_name, leg_assist_phone, leg_assist_email = re.search(
                assist_re, leg_assist
            ).groups()
            media_contact_name, media_contact_phone, media_contact_email = re.search(
                r"(.+)Phone:\s(.+)Email:\s(.+)", media_contact
            ).groups()
        else:
            leg_assist_name, leg_assist_phone, leg_assist_email = re.search(
                assist_re, leg_assist
            ).groups()
            # Media contact is in the third paragraph normally; Bray's page
            # (and any 3-paragraph page) uses index 2, others index 3.
            media_idx = (
                2
                if len(contact_ps) == 3
                or self.source.url == "https://www.indianasenaterepublicans.com/bray"
                else 3
            )
            media_contact = contact_ps[media_idx].text_content()
            media_contact_name, media_contact_phone, media_contact_email = re.search(
                r"Media\sContact:(.+)Phone:\s(.+)Email:\s(.+)", media_contact
            ).groups()

        p.extras["legislative assistant name"] = leg_assist_name
        p.extras["legislative assistant phone"] = leg_assist_phone
        p.extras["legislative assistant email"] = leg_assist_email
        p.extras["media contact name"] = media_contact_name
        p.extras["media contact phone"] = media_contact_phone
        p.extras["media contact email"] = media_contact_email

        # TODO: education is still unscraped -- the multi-line block after the
        # "Education" heading needs handling before it can be captured.

        return p
Example #4
    def process_page(self):
        """Add photo, title, counties, contact info, and socials from the
        legislator's profile page."""
        p = self.input

        p.image = CSS("img.leg-img").match_one(self.root).get("src")

        title = (
            CSS("div .row.profile-top h3").match_one(self.root).text_content().strip()
        )
        if title != "":
            p.extras["title"] = title

        # County list; strip a trailing "(Part)" marker when present.
        counties = CSS("div .center ul li").match_one(self.root).text_content()
        if re.search(r"\(Part\)", counties):
            counties = re.search(r"(.+)\s\(Part\)", counties).groups()[0]
        p.extras["counties represented"] = counties.split(", ")

        p.email = (
            XPath("//div[2]/p[contains(text(), 'Email')]")
            .match_one(self.root)
            .getnext()
            .text_content()
        )

        # Each <address> block is labeled by the element immediately before it.
        for addr_el in CSS("address").match(self.root):
            label = addr_el.getprevious().text_content()
            full_addr = " ".join(XPath("text()").match(addr_el))
            if label == "Mailing Address":
                p.extras["mailing address"] = full_addr
            elif label == "Legislative Address":
                p.district_office.address = full_addr
            elif label == "Capitol Address":
                p.capitol_office.address = full_addr

        phone_block = (
            XPath("//div[2]/p[contains(text(), 'Phone Number(s)')]")
            .match_one(self.root)
            .getnext()
        )
        # Entries look like "LRC: 555-123-4567" or "Home: 555-765-4321 (fax)".
        offices = {"LRC": p.capitol_office, "Home": p.district_office}
        for entry in XPath("text()").match(phone_block):
            kind, number = entry.split(": ")
            is_fax = number.endswith(" (fax)")
            if is_fax:
                number = number.replace(" (fax)", "")
            if kind in offices:
                if is_fax:
                    offices[kind].fax = number
                else:
                    offices[kind].voice = number
            elif kind == "Work":
                if is_fax:
                    p.extras["fax"] = number
                else:
                    p.extras["voice"] = number

        # Twitter and home city are optional sections.
        try:
            handle = (
                XPath("//div[2]/p[contains(text(), 'Twitter')]")
                .match_one(self.root)
                .getnext()
                .text_content()
            )
            p.ids.twitter = handle.lstrip("@")
        except SelectorError:
            pass

        try:
            p.extras["home city"] = (
                XPath("//div[2]/p[contains(text(), 'Home City')]")
                .match_one(self.root)
                .getnext()
                .text_content()
            )
        except SelectorError:
            pass

        return p
Example #5
    def process_page(self):
        """Construct a ScrapePerson from the member detail page, including
        capitol contact info and any website/social links."""
        name = CSS("h1").match(self.root)[0].text_content().strip()

        district = CSS("p.h4").match_one(self.root).text_content().strip()
        # Some pages render the bullet entity literally before the district.
        if re.search(r"&bullet;", district):
            district = re.search(r"&bullet;(.+)", district).groups()[0].strip()
        if district == "chairman":
            district = "Chairman"

        party = CSS("ul li p").match(self.root)[1].text_content().strip()
        if re.search(r"Party", party):
            party = re.search(r"(.+)\sParty", party).groups()[0]

        p = ScrapePerson(
            name=name,
            party=party,
            district=district,
            state=self.input.state,
            chamber=self.input.chamber,
            image=self.input.image,
        )
        p.add_source(self.input.source1)
        p.add_source(self.input.source2)
        p.add_link(self.input.link, note="homepage")

        p.capitol_office.address = (
            CSS("ul li p").match(self.root)[3].text_content().strip()
        )
        p.email = CSS("p.byline a").match(self.root)[0].text_content().strip()

        phone = CSS("p.byline a").match(self.root)[1].text_content().strip()
        if re.search(r"tel:", phone):
            phone = re.search(r"tel:(.+)", phone).groups()[0]
        p.capitol_office.voice = phone

        # The fax number only appears in the combined byline text.
        byline_text = CSS("p.byline").match_one(self.root).text_content().strip()
        p.capitol_office.fax = byline_text.split("Fax: ")[1]

        # The first two aside links are fixed; what follows is a website
        # and/or socials, distinguished purely by how many links there are.
        aside_links = CSS("section.aside-section a").match(self.root)
        n_links = len(aside_links)
        if n_links == 2:
            # No extra info at all.
            return p
        if n_links == 3:
            # Just a website.
            p.extras["website"] = aside_links[2].get("href")
        elif n_links == 4:
            # Just Facebook and Twitter.
            p.ids.facebook = aside_links[2].get("href").split("/")[-2]
            p.ids.twitter = aside_links[3].get("href").split("/")[-1]
        else:
            # Website, Facebook, and Twitter.
            p.extras["website"] = aside_links[2].get("href")
            fb_parts = aside_links[3].get("href").split("/")
            p.ids.facebook = fb_parts[-2] if fb_parts[-1] == "" else fb_parts[-1]
            p.ids.twitter = aside_links[4].get("href").split("/")[-1]

        return p