def process_item(self, item):
    """Build a ScrapePerson from one Louisiana legislator card and
    chain to the member's detail page."""
    raw_name = CSS("h4 span").match_one(item).text_content().strip()
    # Vacant seats have no member to scrape.
    if re.search(r"Vacant", raw_name):
        self.skip()

    # The card renders names as "Last, First" -> reorder to "First Last".
    name_parts = raw_name.split(", ")
    full_name = name_parts[1] + " " + name_parts[0]

    # District and party values sit in the sibling element after their icons.
    district = CSS("i.fa.fa-map").match_one(item).getnext().text_content().strip()
    party = CSS("i.fa.fa-users").match_one(item).getnext().text_content().strip()
    if party == "Democrat":
        party = "Democratic"

    email = CSS("a").match(item)[2].text_content().strip()
    image_url = CSS("img").match_one(item).get("src")

    person = ScrapePerson(
        name=full_name,
        state="la",
        party=party,
        district=district,
        chamber=self.chamber,
        email=email,
        image=image_url,
    )

    detail_link = CSS("a").match(item)[1].get("href")
    person.add_source(self.source.url)
    person.add_source(detail_link)
    person.add_link(detail_link, note="homepage")

    return LegislatorDetail(person, source=detail_link)
def process_item(self, item):
    """Extract one House member card and chain to the member detail page."""
    name = CSS(".mediaCaptionTitle").match_one(item).text
    subtitle = CSS(".mediaCaptionSubtitle").match_one(item).text

    # The photo URL is embedded in the element's inline style attribute.
    style_attr = CSS(".photo").match_one(item).get("style")
    image = background_image_re.findall(style_attr)[0]

    # Subtitle looks like "District 25 | D".
    district_part, party_code = subtitle.split(" | ")
    district = district_part.split()[1]
    party = {"D": "Democratic", "R": "Republican"}[party_code]

    partial = HousePartial(
        name=name,
        district=district,
        party=party,
        url=item.get("href"),
        image=image,
    )
    return HouseDetail(partial)
def process_page(self):
    """Augment the input person with contact details scraped from an
    Indiana Senate Republicans member page.

    Parses the capitol address/phones from the first contact paragraph,
    then the legislative-assistant and media-contact blocks, whose
    layout varies per page (hence the element-count branches and the
    URL-specific special cases below).
    """
    p = self.input
    email = CSS("div.sen-contact a").match(self.root)[0].get("href")
    # Href is a mailto: link; keep only the address part.
    email = re.search(r"mailto:(.+)", email).groups()[0]
    p.email = email
    addr = CSS("div.sen-contact p").match(self.root)[0].text_content()
    # no phone for this link
    if self.source.url == "https://www.indianasenaterepublicans.com/young":
        addr = addr
        phone1 = None
        phone2 = None
    else:
        # Paragraph packs "ADDRESS Phone: xxx-xxx-xxxx or xxx-xxx-xxxx".
        addr, phone1, phone2 = re.search(
            r"(.+)Phone:\s(\d{3}-\d{3}-\d{4})\s?or\s(\d{3}-\d{3}-\d{4})", addr
        ).groups()
    p.capitol_office.address = addr
    if phone1:
        p.capitol_office.voice = phone1
    if phone2:
        p.extras["second phone"] = phone2
    # The assistant info is either the only paragraph or the second one.
    if len(CSS("div.sen-contact p").match(self.root)) == 1:
        leg_assist = CSS("div.sen-contact p").match_one(self.root).text_content()
    else:
        leg_assist = CSS("div.sen-contact p").match(self.root)[1].text_content()
    if len(CSS("div.sen-contact p").match(self.root)) < 3:
        # Assistant and media contact share one paragraph; split on the label.
        extra_contacts = leg_assist.split("Media Contact:")
        leg_assist = extra_contacts[0]
        media_contact = extra_contacts[1]
        leg_assist_name, leg_assist_phone, leg_assist_email = re.search(
            r"Legislative\sAssistant:?(.+)Phone:\s(.+)Email:\s(.+)", leg_assist
        ).groups()
        media_contact_name, media_contact_phone, media_contact_email = re.search(
            r"(.+)Phone:\s(.+)Email:\s(.+)", media_contact
        ).groups()
    elif (
        len(CSS("div.sen-contact p").match(self.root)) == 3
        or self.source.url == "https://www.indianasenaterepublicans.com/bray"
    ):
        # Media contact lives in the third paragraph (bray's page included,
        # regardless of its paragraph count).
        leg_assist_name, leg_assist_phone, leg_assist_email = re.search(
            r"Legislative\sAssistant:?(.+)Phone:\s(.+)Email:\s(.+)", leg_assist
        ).groups()
        media_contact = CSS("div.sen-contact p").match(self.root)[2].text_content()
        media_contact_name, media_contact_phone, media_contact_email = re.search(
            r"Media\sContact:(.+)Phone:\s(.+)Email:\s(.+)", media_contact
        ).groups()
    else:
        # Four or more paragraphs: media contact is in the fourth.
        leg_assist_name, leg_assist_phone, leg_assist_email = re.search(
            r"Legislative\sAssistant:?(.+)Phone:\s(.+)Email:\s(.+)", leg_assist
        ).groups()
        media_contact = CSS("div.sen-contact p").match(self.root)[3].text_content()
        media_contact_name, media_contact_phone, media_contact_email = re.search(
            r"Media\sContact:(.+)Phone:\s(.+)Email:\s(.+)", media_contact
        ).groups()
    p.extras["legislative assistant name"] = leg_assist_name
    p.extras["legislative assistant phone"] = leg_assist_phone
    p.extras["legislative assistant email"] = leg_assist_email
    p.extras["media contact name"] = media_contact_name
    p.extras["media contact phone"] = media_contact_phone
    p.extras["media contact email"] = media_contact_email
    # NOTE(review): dead code left by a previous author — education parsing
    # was never finished. Kept as-is; consider deleting.
    """
    try:
        # need to deal with multi-lines of education
        print(
            XPath("//h3[contains(text(), 'Education')]")
            .match(self.root)[0]
            .getnext()
            .text_content()
        )
    except SelectorError:
        pass
    """
    return p
def process_page(self):
    """Fill in the input person from their profile detail page: image,
    title, counties, email, addresses, phone numbers, and optional
    Twitter handle / home city."""
    person = self.input

    person.image = CSS("img.leg-img").match_one(self.root).get("src")

    title = (
        CSS("div .row.profile-top h3").match_one(self.root).text_content().strip()
    )
    if title != "":
        person.extras["title"] = title

    # Strip a trailing "(Part)" marker before splitting the county list.
    county_text = CSS("div .center ul li").match_one(self.root).text_content()
    if re.search(r"\(Part\)", county_text):
        county_text = re.search(r"(.+)\s\(Part\)", county_text).groups()[0]
    person.extras["counties represented"] = county_text.split(", ")

    person.email = (
        XPath("//div[2]/p[contains(text(), 'Email')]")
        .match_one(self.root)
        .getnext()
        .text_content()
    )

    # Each <address> is labelled by its preceding element; join its bare
    # text nodes with single spaces.
    for block in CSS("address").match(self.root):
        label = block.getprevious().text_content()
        joined = " ".join(XPath("text()").match(block))
        if label == "Mailing Address":
            person.extras["mailing address"] = joined
        elif label == "Legislative Address":
            person.district_office.address = joined
        elif label == "Capitol Address":
            person.capitol_office.address = joined

    phone_container = (
        XPath("//div[2]/p[contains(text(), 'Phone Number(s)')]")
        .match_one(self.root)
        .getnext()
    )
    # Entries look like "LRC: 555-555-5555" or "Home: 555-555-5555 (fax)".
    for entry in XPath("text()").match(phone_container):
        kind, number = entry.split(": ")
        is_fax = number.endswith(" (fax)")
        if is_fax:
            number = number.replace(" (fax)", "")
        if kind == "LRC":
            if is_fax:
                person.capitol_office.fax = number
            else:
                person.capitol_office.voice = number
        elif kind == "Home":
            if is_fax:
                person.district_office.fax = number
            else:
                person.district_office.voice = number
        elif kind == "Work":
            if is_fax:
                person.extras["fax"] = number
            else:
                person.extras["voice"] = number

    # Twitter and home city are optional; absence raises SelectorError.
    try:
        handle = (
            XPath("//div[2]/p[contains(text(), 'Twitter')]")
            .match_one(self.root)
            .getnext()
            .text_content()
            .lstrip("@")
        )
        person.ids.twitter = handle
    except SelectorError:
        pass

    try:
        home_city = (
            XPath("//div[2]/p[contains(text(), 'Home City')]")
            .match_one(self.root)
            .getnext()
            .text_content()
        )
        person.extras["home city"] = home_city
    except SelectorError:
        pass

    return person
def process_page(self):
    """Scrape a legislator detail page into a ScrapePerson.

    Pulls name/district/party from the page header, capitol contact info
    from the byline, and optional website/Facebook/Twitter links from the
    aside section (which contains 2, 3, 4, or 5 anchors depending on what
    the member lists).

    Returns:
        ScrapePerson: the populated person record.
    """
    name = CSS("h1").match(self.root)[0].text_content().strip()

    district = CSS("p.h4").match_one(self.root).text_content().strip()
    # District is rendered after a bullet, e.g. "• District 12".
    if re.search(r"•", district):
        district = re.search(r"•(.+)", district).groups()[0].strip()
    if district == "chairman":
        district = "Chairman"

    party = CSS("ul li p").match(self.root)[1].text_content().strip()
    # Strip a trailing "Party", e.g. "Republican Party" -> "Republican".
    if re.search(r"Party", party):
        party = re.search(r"(.+)\sParty", party).groups()[0]

    p = ScrapePerson(
        name=name,
        party=party,
        district=district,
        state=self.input.state,
        chamber=self.input.chamber,
        image=self.input.image,
    )
    p.add_source(self.input.source1)
    p.add_source(self.input.source2)
    p.add_link(self.input.link, note="homepage")

    addr = CSS("ul li p").match(self.root)[3].text_content().strip()
    p.capitol_office.address = addr

    p.email = CSS("p.byline a").match(self.root)[0].text_content().strip()

    phone = CSS("p.byline a").match(self.root)[1].text_content().strip()
    # Anchor text may be the raw tel: URI.
    if re.search(r"tel:", phone):
        phone = re.search(r"tel:(.+)", phone).groups()[0]
    p.capitol_office.voice = phone

    all_text = CSS("p.byline").match_one(self.root).text_content().strip()
    # BUGFIX: guard the split — not every page lists a fax number, and the
    # unconditional [1] index raised IndexError when it was missing.
    if "Fax: " in all_text:
        p.capitol_office.fax = all_text.split("Fax: ")[1]

    def facebook_handle(url):
        # Last non-empty path segment, whether or not the URL ends in "/".
        # (BUGFIX: the 4-anchor branch previously assumed a trailing slash
        # and always took [-2], yielding the wrong segment otherwise.)
        pieces = url.split("/")
        return pieces[-2] if pieces[-1] == "" else pieces[-1]

    links = CSS("section.aside-section a").match(self.root)
    if len(links) == 2:
        # no extra info
        return p
    elif len(links) == 3:
        # just a website
        p.extras["website"] = links[2].get("href")
    elif len(links) == 4:
        # just fb and twitter
        p.ids.facebook = facebook_handle(links[2].get("href"))
        p.ids.twitter = links[3].get("href").split("/")[-1]
    else:
        # website, fb, and twitter
        p.extras["website"] = links[2].get("href")
        p.ids.facebook = facebook_handle(links[3].get("href"))
        p.ids.twitter = links[4].get("href").split("/")[-1]
    return p