def process_page(self):
    """Scrape a single NC legislator detail page into a ScrapePerson.

    Control flow notes (important — do not "simplify"):
    - Names bound only in some branches (``legislative_assistant``,
      ``occupation``, ``military``) are intentionally left unbound in the
      other branches; the later ``except UnboundLocalError`` blocks use
      that as feature detection for which page layout was seen.
    - ``except IndexError`` around ``address_header[1]`` handles pages
      with only one address header (senators).
    """
    # Pages with a legislative assistant have two mailto links; .match()
    # raises ValueError when the count doesn't fit the 2-tuple unpack,
    # in which case only the member's own email link is present.
    try:
        email, legislative_assistant = XPath(
            "//a[contains(@href, 'mailto')]"
        ).match(self.root)
    except ValueError:
        # Single mailto link: legislative_assistant stays unbound on purpose.
        email = XPath("//a[contains(@href, 'mailto')]").match_one(self.root)
    image = (
        XPath("//img[contains(@src, '/Members/MemberImage')]")
        .match_one(self.root)
        .get("src")
    )
    table = XPath("//div[@class='row mx-md-0']/div").match(self.root)
    # there are different combinations of information the page can have;
    # each branch unpacks label/value cell pairs (labels discarded as __).
    if len(table) == 6:
        # NOTE(review): `assistant` is bound here but never used below —
        # the assistant data actually comes from the mailto match above.
        __, terms, __, main_phone, __, assistant = table
    elif len(table) == 10:
        __, terms, __, occupation, __, main_phone, __, military, __, __ = table
    else:
        # Default layout: occupation present, no military service cell.
        __, terms, __, occupation, __, main_phone, __, __ = table
    p = ScrapePerson(
        name=self.input.name,
        state="nc",
        chamber=self.input.chamber,
        party=self.input.party,
        district=self.input.district,
        email=email.text_content(),
        image=image,
    )
    # Address sections are headed by <h6 class="mt-3"> elements; the first
    # header's two following <p> siblings form the primary address.
    address_header = XPath("//h6[@class='mt-3']").match(self.root)
    address = XPath(".//following-sibling::p").match(address_header[0])
    address = address[0].text_content() + "; " + address[1].text_content()
    main_phone = main_phone.text_content().replace("\r\n", "").strip()
    # representatives have both legislative office addresses and mailing addresses,
    # while senators only have mailing addresses
    try:
        # address_header[1] raises IndexError when only one header exists.
        if address_header[1].text_content() == "Mailing Address:":
            # NOTE(review): this re-matches from address_header[0], so
            # mailing_address duplicates `address` above — possibly meant
            # to be address_header[1]; confirm against a live rep page.
            mailing_address = XPath(".//following-sibling::p").match(
                address_header[0]
            )
            mailing_address = (
                mailing_address[0].text_content()
                + "; "
                + mailing_address[1].text_content()
            )
            p.extras["mailing address"] = mailing_address
            # The capitol office phone is the <p> immediately before the
            # "Mailing Address:" header.
            office_number = (
                XPath(".//preceding-sibling::p[1]")
                .match_one(address_header[1])
                .text_content()
                .replace("\r\n", "")
                .strip()
            )
            # some reps have main phones and capitol office phones,
            # and senators only have capitol office phones
            if office_number != main_phone:
                p.capitol_office.voice = office_number
                p.extras["main phone"] = main_phone
            else:
                p.capitol_office.voice = main_phone
        # NOTE(review): if a second header exists but is not
        # "Mailing Address:", capitol_office.voice is never set here.
    except IndexError:
        # Only one address header (senator layout): main phone is the
        # capitol office phone.
        p.capitol_office.voice = main_phone
    p.capitol_office.address = address
    # Normalize "( N )" spacing to "(N)" in the terms string.
    p.extras["terms in senate"] = (
        terms.text_content().replace("( ", "(").replace(" )", ")")
    )
    p.extras["represented counties"] = self.input.counties
    # Each block below probes an optionally-bound name; UnboundLocalError
    # means the corresponding section was absent from this page layout.
    try:
        p.extras["legislative assistant"] = legislative_assistant.text_content()
        # href is "mailto:<addr>"; take the part after the colon.
        p.extras["legislative assistant email"] = legislative_assistant.get(
            "href"
        ).split(":")[1]
    except UnboundLocalError:
        pass
    try:
        p.extras["occupation"] = occupation.text_content()
    except UnboundLocalError:
        pass
    try:
        p.extras["military experience"] = military.text_content()
    except UnboundLocalError:
        pass
    if self.input.appointment:
        p.extras["appointment date"] = self.input.appointment
    p.add_source(self.source.url)
    p.add_source(self.input.url)
    # Nav-pill links (e.g. committees, votes) become person links.
    for url in XPath(
        "//nav[contains(@class, 'nav nav-pills')]/a[@class='nav-item nav-link']"
    ).match(self.root):
        p.add_link(url.get("href"))
    return p
def process_item(self, item):
    """Scrape one CA Senate roster card into a ScrapePerson.

    Entries whose name does not carry an " (R)"/" (D)" suffix are
    skipped. Parses the capitol office plus any district office lines,
    tolerating one known formatting variant (period instead of
    semicolon between address and phone).
    """
    full_name = XPath(".//h3/text()").match(item)[0]
    # Party is encoded as a suffix on the displayed name.
    if full_name.endswith(" (R)"):
        party = "Republican"
    elif full_name.endswith(" (D)"):
        party = "Democratic"
    else:
        # skip() aborts processing of this roster entry.
        self.skip("skipping " + full_name)
    person_name = full_name.split(" (")[0]

    district_text = XPath(
        './/div[contains(@class, "senator-district")]/div/text()'
    ).match(item)[0]
    # Drop zero-padding from district numbers (e.g. "09" -> "9").
    district = district_text.strip().lstrip("0")

    p = ScrapePerson(
        name=person_name,
        state="ca",
        chamber="upper",
        district=district,
        party=party,
        image=XPath(".//img/@src").match_one(item),
    )

    def _add_district_office(addr, phone):
        # Every office parsed below is a district office.
        p.add_office(
            classification="district",
            address=addr.strip(),
            voice=phone.strip(),
        )

    capitol_cell = XPath(
        ".//div[contains(@class, 'views-field-field-senator-capitol-office')]//p"
    ).match_one(item)
    # Cell text is "address; phone" with non-breaking spaces.
    cap_addr, cap_phone = (
        capitol_cell.text_content().replace("\xa0", " ").split("; ")
    )
    p.capitol_office.address = cap_addr.strip()
    p.capitol_office.voice = cap_phone.strip()

    district_cell = XPath(
        ".//div[contains(@class, 'views-field-field-senator-district-office')]"
    ).match_one(item)
    for line in district_cell.text_content().strip().splitlines():
        try:
            if re.search(r"District Offices?", line):
                # Section heading, not an address line.
                continue
            addr, phone = line.strip().replace("\xa0", " ").split("; ")
            _add_district_office(addr, phone)
        except ValueError:
            # Steven Bradford address/phone separated by period instead of semi-colon
            if re.search(r"\w+\.\s\(\d{3}\)", line):
                addr, phone = line.strip().replace("\xa0", " ").split(". (")
                _add_district_office(addr, "(" + phone)

    p.add_link(XPath(".//a/@href").match(item)[0])
    p.add_source(self.source.url)
    return p