def process_page(self):
    """Complete the committee passed in as ``self.input`` from its page.

    Records the page URL as both a source and a "homepage" link, stores
    the room and meeting schedule in ``extras``, and adds one member per
    member-bio link found in the page's media bodies.

    Returns:
        The same committee object that was supplied as ``self.input``.

    Raises:
        ValueError: if a label preceding a member link is not one of the
            recognised leadership roles.
    """
    committee = self.input
    page_url = self.source.url
    committee.add_source(page_url)
    committee.add_link(page_url, note="homepage")

    # The unpacking requires this selector to yield exactly two text nodes:
    # the room first, then the meeting time.
    room, schedule = XPath(
        "//div[@class='col-sm-12 pb-2']//p[2]/text()"
    ).match(self.root)
    if re.search("On Call", schedule):
        # Keep only the part in front of the first " -" separator.
        schedule = schedule.split(" -")[0]
    committee.extras["room"] = room.strip()
    committee.extras["meeting schedule"] = schedule.strip()

    known_roles = ("chair", "vice chair", "ranking minority member")
    member_links = XPath(
        '//div[contains(@class, "media-body")]//a[contains(@href, "member_bio")]'
    ).match(self.root)
    for anchor in member_links:
        # Link text is split on the first comma; only the leading part is
        # used as the member's name.
        name = anchor.text_content().split(",")[0]
        if not name:
            continue
        try:
            labels = XPath("..//preceding-sibling::b/text()").match(anchor)
            # Every label is validated; the last one becomes the role.
            for label in labels:
                position_str = label.lower()
                if position_str not in known_roles:
                    raise ValueError("unknown position")
        except SelectorError:
            # The label lookup failed, so treat this as a regular member.
            position_str = "member"
        committee.add_member(name, position_str)

    return committee
def process_page(self):
    """Build a ``ScrapePerson`` for one lower-chamber member's bio page.

    Reads party, photo, title, phone/fax/TTY numbers and the district
    office address from ``self.root`` and combines them with the name,
    district and source carried on ``self.input``.

    Returns:
        The populated ``ScrapePerson``.

    Raises:
        KeyError: if the page lists a party abbreviation missing from the
            local party map.
        AttributeError: if a phone, fax or TTY line does not match its
            expected ``Tel.``/``Fax.``/``TTY.`` pattern (``re.search``
            returns ``None``).
        IndexError: if a contact block has fewer lines or spans than
            expected.
    """
    party_map = {
        "PNP": "Partido Nuevo Progresista",
        "PPD": u"Partido Popular Democr\xe1tico",
        "PIP": u"Partido Independentista Puertorrique\u00F1o",
    }

    try:
        party = CSS("span.partyBio").match_one(self.root).text_content().strip()
        party = party_map[party]
    except SelectorError:
        # HON. LISIE J. BURGOS MUÑIZ, HON. JOSÉ B. MÁRQUEZ REYES,
        # HON. MARIANA NOGALES MOLINELLI
        # do not have their parties listed
        party = "Independent"

    p = ScrapePerson(
        name=self.input.name,
        state="pr",
        chamber="lower",
        district=self.input.district,
        party=party,
    )
    p.add_source(self.input.source)
    p.add_source(self.source.url)
    p.add_link(self.source.url, note="homepage")

    p.image = CSS("div.container-biography img").match(self.root)[0].get("src")

    # The title is the text following the <br> inside the name span. The
    # tail is None when nothing follows the <br>, so fall back to "" rather
    # than crashing on an optional field.
    title = (CSS("span.name br").match_one(self.root).tail or "").strip()
    if title:
        p.extras["title"] = title

    # First matching span holds the two "Tel." lines, the second holds the
    # "Fax." and "TTY." lines; match once and index into the result.
    contact_spans = CSS("h6 span span span").match(self.root)

    phones = contact_spans[0].text_content().strip().split("\n")
    phone1 = re.search(r"Tel\.\s(.+)", phones[0]).groups()[0].strip()
    phone2 = re.search(r"Tel\.\s?(.+)?", phones[1]).groups()[0]
    # http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara/biografia.aspx?rep=251
    # has an incomplete phone
    if phone1 and phone1 != "(787":
        p.district_office.voice = phone1
    if phone2 and phone2.strip():
        p.extras["phone 2"] = phone2.strip()

    fax = contact_spans[1].text_content().strip().split("\n")
    fax1 = re.search(r"Fax\.\s(.+)", fax[0]).groups()[0].strip()
    if fax1:
        p.district_office.fax = fax1
    tty = re.search(r"TTY\.\s?(.+)?", fax[1]).groups()[0]
    if tty and tty.strip():
        # Store the stripped value, consistent with "phone 2" above.
        p.extras["TTY"] = tty.strip()

    # these addresses do not look complete but capturing them anyway
    addr = XPath(
        "//*[@id='dnn_ctr1108_ViewWebRepresentatives_WebRepresentatives1_pnlRepresentative']/h6/text()[1]"
    ).match_one(self.root).strip()
    # Strip before testing so a whitespace-only node is not stored as an
    # empty address.
    if addr:
        p.district_office.address = addr

    return p