Пример #1
0
    def process_page(self):
        """Fill in the committee held in ``self.input`` from its page.

        Records the page URL as both a source and a "homepage" link, stores
        the room and meeting schedule in ``extras``, and adds one member for
        every ``member_bio`` link found inside a ``media-body`` container.

        Returns:
            The same committee object that was supplied as ``self.input``.

        Raises:
            ValueError: if a label preceding a member link is not one of the
                recognised positions.
        """
        com = self.input
        com.add_source(self.source.url)
        com.add_link(self.source.url, note="homepage")

        # The second paragraph of the header block is unpacked into exactly
        # two text nodes: the room followed by the meeting time.
        room, schedule = XPath(
            "//div[@class='col-sm-12 pb-2']//p[2]/text()").match(self.root)
        if re.search("On Call", schedule):
            # Keep only the part before " -" when the entry mentions On Call.
            schedule = schedule.split(" -")[0]
        com.extras["room"] = room.strip()
        com.extras["meeting schedule"] = schedule.strip()

        positions = ("chair", "vice chair", "ranking minority member")

        for link in XPath(
                '//div[contains(@class, "media-body")]//a[contains(@href, "member_bio")]'
        ).match(self.root):
            name = link.text_content().split(",")[0]
            if not name:
                # Nothing usable in this link; adding it would create a
                # nameless member with a missing or stale position.
                continue

            # Default for links that carry no bold position label.
            position_str = "member"
            try:
                roles = XPath("..//preceding-sibling::b/text()").match(link)
            except SelectorError:
                # NOTE(review): presumably raised by the selector when no
                # label is found -- confirm against the selector library.
                roles = []

            # Every label is validated; the last one is the one recorded.
            for role in roles:
                position_str = role.lower()
                if position_str not in positions:
                    raise ValueError("unknown position")

            com.add_member(name, position_str)

        return com
Пример #2
0
    def process_page(self):
        """Build a ``ScrapePerson`` for one representative's biography page.

        Name and district come from ``self.input``; party, image, title,
        phone, fax, TTY and address are read from the page in ``self.root``.

        Returns:
            The populated ``ScrapePerson``.

        Raises:
            KeyError: if the page lists a party abbreviation that is not in
                the local party map.
        """
        party_map = {
            "PNP": "Partido Nuevo Progresista",
            "PPD": u"Partido Popular Democr\xe1tico",
            "PIP": u"Partido Independentista Puertorrique\u00F1o",
        }

        try:
            party = CSS("span.partyBio").match_one(
                self.root).text_content().strip()
            party = party_map[party]
        except SelectorError:
            # HON. LISIE J. BURGOS MUÑIZ, HON. JOSÉ B. MÁRQUEZ REYES, HON. MARIANA NOGALES MOLINELLI
            # do not have their parties listed
            party = "Independent"

        p = ScrapePerson(
            name=self.input.name,
            state="pr",
            chamber="lower",
            district=self.input.district,
            party=party,
        )

        p.add_source(self.input.source)
        p.add_source(self.source.url)
        p.add_link(self.source.url, note="homepage")

        p.image = CSS("div.container-biography img").match(
            self.root)[0].get("src")

        # The title is the text that follows the <br> inside the name span.
        title = CSS("span.name br").match_one(self.root).tail.strip()
        if title != "":
            p.extras["title"] = title

        # Match the contact spans once: the first holds the phone lines,
        # the second the fax and TTY lines.
        contact_spans = CSS("h6 span span span").match(self.root)

        phones = contact_spans[0].text_content().strip().split("\n")
        phone1 = re.search(r"Tel\.\s(.+)", phones[0]).group(1).strip()
        phone2 = re.search(r"Tel\.\s?(.+)?", phones[1]).group(1)
        # http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/biografia.aspx?rep=251 has an incomplete phone
        if phone1 != "" and phone1 != "(787":
            p.district_office.voice = phone1
        if phone2 and phone2.strip() != "":
            p.extras["phone 2"] = phone2.strip()

        fax = contact_spans[1].text_content().strip().split("\n")
        fax1 = re.search(r"Fax\.\s(.+)", fax[0]).group(1).strip()
        if fax1 != "":
            p.district_office.fax = fax1
        tty = re.search(r"TTY\.\s?(.+)?", fax[1]).group(1)
        if tty and tty.strip() != "":
            # Store the stripped value, consistent with the phone and fax
            # fields above.
            p.extras["TTY"] = tty.strip()

        # these addresses do not look complete but capturing them anyway
        addr = XPath(
            "//*[@id='dnn_ctr1108_ViewWebRepresentatives_WebRepresentatives1_pnlRepresentative']/h6/text()[1]"
        ).match_one(self.root).strip()
        # Strip before testing so a whitespace-only text node is not stored
        # as an empty address.
        if addr:
            p.district_office.address = addr

        return p