def scrape_member(self, chamber, link):
    """Scrape a single Iowa legislator from a roster-table link."""
    member_name = link.text.strip()
    profile_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # Email moved to the member's detail page; it is no longer in td[5].
    # Normalize the site's party labels to canonical names.
    party = {
        'Democrat': 'Democratic',
        'No Party Specified': 'Independent',
    }.get(party, party)

    # The personID query parameter keys the official portrait photo.
    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" %
                 (self.latest_session(), pid))

    person = Person(name=member_name, primary_org=chamber,
                    district=district, party=party, image=photo_url)
    person.add_link(profile_url)
    person.add_source(profile_url)

    detail_doc = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(person, detail_doc)
    yield person
def scrape(self):
    """Scrape the Temecula City Council organization and its members."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Temecula City Council',
                           classification='legislature')
    council.add_source(urls.list.url)
    yield council

    for row in urls.list.xpath('//table[2]//tr')[1:]:
        # Name and council role share the first paragraph cell.
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # The first link is a mailto:, the second the detail page.
        email, detail_url = row.xpath('td//a/@href')
        email = email[7:]  # drop the "mailto:" prefix
        membership.contact_details.append(
            dict(type='email', value=email, note='work'))

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def scrape_legislator(self, chamber, name, url):
    """Build a Person from a legislator detail page at *url*."""
    page = lxml.html.fromstring(self.get(url).text)
    page.make_links_absolute(url)

    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')

    # Party appears as a single letter in the last <h2>, e.g. "(R -".
    heading = page.xpath('//h2').pop().text_content()
    letter = re.search(r'\((R|D|I)[ \-\]]', heading).group(1)
    party = {'D': 'Democratic',
             'R': 'Republican',
             'I': 'Independent'}[letter]

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    person = Person(name, district=district, party=party,
                    image=photo_url, primary_org=chamber)
    person.add_link(url)
    person.add_source(url)
    self.scrape_offices(person, page)
    yield person
def test_committee_add_member_person():
    """add_member() links a person to the committee with the given role."""
    committee = Organization('Defense', classification='committee')
    member = Person('John Adams')
    committee.add_member(member, role='chairman')

    membership = committee._related[0]
    assert membership.person_id == member._id
    assert membership.organization_id == committee._id
    assert membership.role == 'chairman'
def scrape_alderman(self, ward_num):
    """Scrape the alderman profile for the given ward number."""
    ward_url = "{}/ward-{}".format(Urls.ALDERMEN_HOME, ward_num)
    alderman_url = self.alderman_url(ward_url)
    page = self.lxmlize(alderman_url)

    # The person's name is the only <h1> tag on the page.
    name = HumanName.name_firstandlast(page.xpath("//h1/text()")[0])

    # The district label lets pupa automatically create a membership
    # linking this person to the matching post in the jurisdiction's
    # "Board of Aldermen" organization.
    person = Person(name=name,
                    district="Ward {} Alderman".format(ward_num),
                    role="Alderman",
                    primary_org="legislature")

    # Additional fields from the profile page.
    person.image = page.xpath("//div/img/@src")[0]
    phone = page.xpath("//strong[text()='Phone:']/../text()")[1].strip()
    person.add_contact_detail(type="voice", value=phone)

    person.add_source(alderman_url, note="profile")
    person.add_source(ward_url, note="ward")
    return person
def handle_list_item(self, item):
    """Turn one senate roster entry into a Person (None for vacant seats)."""
    name = " ".join(item.xpath('.//text()'))
    name = re.sub(r'\s+', " ", name).replace(" ,", ",").strip()
    if 'Vacant' in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == 'Democrat':
        party = 'Democratic'

    leg_url = item.get('href')
    senator = Person(name=name, district=district, party=party,
                     primary_org='upper', role='Senator')
    senator.add_link(leg_url)
    senator.add_source(self.url)
    senator.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=senator)
    return senator
def handle_list_item(self, item):
    """Turn one FL House roster entry into a Person (None if seat empty)."""
    link = item.xpath('.//div[contains(@class, "rep_style")]/a')[0]
    name = link.text_content().strip()
    if any(flag in name for flag in ('Vacant', 'Resigned', 'Pending')):
        return

    party_letter = item.xpath(
        './/div[contains(@class, "party_style")]/text()')[0].strip()
    party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
    district = item.xpath(
        './/div[contains(@class, "district_style")]/text()')[0].strip()

    leg_url = link.get('href')
    # The MemberId query parameter keys the official portrait image.
    member_id = parse.parse_qs(parse.urlsplit(leg_url).query)['MemberId'][0]
    image = ("http://www.flhouse.gov/FileStores/Web/Imaging/Member/"
             "{}.jpg".format(member_id))

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative', image=image)
    rep.add_link(leg_url)
    rep.add_source(leg_url)
    rep.add_source(self.url)
    self.scrape_page(RepDetail, leg_url, obj=rep)
    return rep
def test_save_object_invalid():
    """save_object() rejects a person that fails schema validation."""
    scraper = Scraper('jurisdiction', '/tmp/')
    person = Person('Michael Jordan')
    # A person with no source attached does not validate.
    with pytest.raises(ValueError):
        scraper.save_object(person)
def scrape_upper_chamber(self, term):
    """Scrape Oklahoma senators from the chamber roster page."""
    url = "http://oksenate.gov/Senators/Default.aspx"
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    roster = doc.xpath('//table[@summary]')[0]
    for a in roster.xpath('.//td//a[contains(@href, "biographies")]'):
        # The district number sits either in the anchor's tail text or
        # in a sibling <span>, depending on the row's markup.
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning(
                "District {} appears to be empty".format(district))
            continue

        # Anchor text looks like "Jane Doe (R)".
        match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
        name, party = match.group(1), self._parties[match.group(2)]

        bio_url = a.get('href')
        person = Person(
            primary_org='upper',
            district=district,
            name=name.strip(),
            party=party,
        )
        person.add_link(bio_url)
        person.add_source(bio_url)
        self.scrape_upper_offices(person, bio_url)
        yield person
def scrape_senator_page(self, chamber, url):
    """Scrape Ohio senators from the senate roster page.

    Visits each senator's homepage for phone, address, and party, and
    yields a Person per member. Vacant seats are skipped.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"):
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
        # The style attribute embeds the photo as "url(...)"; extract it.
        img = img[img.find("(") + 1:img.find(")")]
        full_name = legislator.xpath(
            ".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(
            ".//a[@class='profileImageLink']")[0].attrib["href"]
        district = legislator.xpath(".//div[@class='profileDistrict']"
                                    "/a/text()")[0].split("#")[1]

        if "Vacant" in full_name:
            continue

        # BUG FIX: this document previously reassigned `page`, shadowing
        # the roster document; use a distinct name for the detail page.
        homepage = self.get(homepage_url).text
        detail_page = lxml.html.fromstring(homepage)

        phone = detail_page.xpath("//div[@class='phone']/span/text()")[0]
        address_lines = detail_page.xpath(
            "//div[@class='address']/span/text()")
        address = "\n".join(address_lines)

        party_image = detail_page.xpath(
            '//div[@class="senatorParty"]/img/@src')[0]
        if "Republican" in party_image:
            party = "Republican"
        elif "Democrat" in party_image:
            party = "Democratic"
        else:
            # BUG FIX: previously an unrecognized image silently reused
            # the previous member's party (or raised NameError on the
            # first member). Fail with a clear message instead.
            raise ValueError("Unknown party image for {}: {}".format(
                full_name, party_image))

        # Official email addresses derive from the zero-padded district.
        email = ("rep{0:0{width}}@ohiohouse.gov" if chamber == "lower"
                 else "sd{0:0{width}}@ohiosenate.gov").format(
                     int(district), width=2)

        leg = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            image=img,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone,
                               note="Capitol Office")
        leg.add_contact_detail(type="email", value=email,
                               note="Capitol Office")
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def scrape(self):
    """Scrape members of India's Lok Sabha (lower) and Rajya Sabha (upper)."""
    # lower
    url = 'http://164.100.47.194/Loksabha/Members/AlphabeticalList.aspx'
    entry = self.get(url).content
    page = lxml.html.fromstring(entry)
    page.make_links_absolute(url)
    for tr in page.xpath(
            '//table[contains(@class,"member_list_table")]/tr'):
        name = tr.xpath('td[2]/a[1]/@title')[0]
        bio = tr.xpath('td[2]/a[1]/@href')[0]
        photo_url = tr.xpath('td[2]/a[1]/img/@src')[0]
        party = tr.xpath('td[3]/text()')[0].strip()
        state = tr.xpath('td[4]/text()')[0].strip()
        member = Person(name=name, role="member", primary_org="lower",
                        party=party, image=photo_url, district=state)
        # BUG FIX: lower-chamber members previously cited the
        # upper-chamber roster URL as their source; cite the page
        # actually scraped.
        member.add_source(url)
        yield member

    # upper
    url = 'http://164.100.47.5/Newmembers/memberlist.aspx'
    entry = self.get(url).content
    page = lxml.html.fromstring(entry)
    page.make_links_absolute(url)
    for tr in page.xpath(
            '//table[@id="ContentPlaceHolder1_GridView2"]/tr')[1:]:
        name = tr.xpath('td[2]/font/a/text()')[0]
        party_abbr = tr.xpath('td[3]/font/text()')[0].strip()
        state = tr.xpath('td[4]/font/text()')[0].strip()
        member = Person(name=name, role="member", primary_org="upper",
                        party=party_abbr, district=state)
        member.add_source(url)
        yield member
def scrape_chamber(self, chamber):
    """Scrape Alaska legislators for one chamber."""
    self._party_map = {
        'Democrat': 'Democratic',
        'Republican': 'Republican',
        'Non Affiliated': 'Independent',
        'Not Affiliated': 'Independent',
    }
    url = ('http://senate.legis.state.ak.us/' if chamber == 'upper'
           else 'http://house.legis.state.ak.us/')
    page = self.lxmlize(url)

    for item in page.xpath('//ul[@class="item"]')[1].getchildren():
        photo_url = item.xpath('.//img/@src')[0]
        name = item.xpath('.//strong/text()')[0]
        leg_url = item.xpath('.//a/@href')[0]

        email = item.xpath('.//a[text()="Email Me"]/@href')
        if email:
            email = email[0].replace('mailto:', '')
        else:
            self.warning('no email for ' + name)

        # Party and district come from <dt>/<dd> definition pairs;
        # deceased members are marked in the <dt> label and skipped.
        party = district = None
        skip = False
        for dt in item.xpath('.//dt'):
            dd = dt.xpath('following-sibling::dd')[0].text_content()
            label = dt.text.strip()
            if label == 'Party:':
                party = dd
            elif label == 'District:':
                district = dd
            elif label.startswith('Deceased'):
                skip = True
                self.warning('skipping deceased ' + name)
                break
        if skip:
            continue

        person = Person(
            primary_org=chamber,
            district=district,
            name=name,
            party=self._party_map[party],
            image=photo_url,
        )
        person.add_source(leg_url)
        person.add_link(leg_url)
        # Contact details live on the member's own page.
        self._scrape_offices(person, leg_url, email)
        yield person
def scrape_upper_chamber(self, term):
    """Scrape Puerto Rico senators from the Senate roster page.

    Visits each senator's profile page for name, party, district,
    photo, phone, and email, and yields one Person per senator.
    """
    url = 'https://senado.pr.gov/Pages/Senadores.aspx'
    doc = self.lxmlize(url)
    links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')
    for link in links:
        senator_page = self.lxmlize(link)
        profile_links = self.get_nodes(senator_page,
                                       '//ul[@class="profiles-links"]/li')
        name_text = self.get_node(
            senator_page,
            '//span[@class="name"]').text_content().strip()
        # Convert to title case as some names are in all-caps
        name = re.sub(r'^Hon\.', '', name_text,
                      flags=re.IGNORECASE).strip().title()
        party = profile_links[0].text_content().strip()
        # Translate to English since being an Independent is a universal construct
        if party == "Independiente":
            party = "Independent"
        photo_url = self.get_node(senator_page,
                                  '//div[@class="avatar"]//img/@src')
        # District senators list their district; at-large senators are
        # labeled "por Acumulación".
        # NOTE(review): if the label matches neither branch, `district`
        # is never bound and Person() below raises NameError — confirm
        # the site only ever uses these two labels.
        if profile_links[1].text_content().strip(
        ) == "Senador por Distrito":
            district_text = self.get_node(
                senator_page,
                '//div[@class="module-distrito"]//span[@class="headline"]'
            ).text_content()
            district = district_text.replace('DISTRITO', '',
                                             1).replace('\u200b',
                                                        '').strip()
        elif profile_links[1].text_content().strip(
        ) == "Senador por Acumulación":
            district = "At-Large"
        phone_node = self.get_node(senator_page,
                                   '//a[@class="contact-data tel"]')
        phone = phone_node.text_content().strip()
        email_node = self.get_node(senator_page,
                                   '//a[@class="contact-data email"]')
        # Strip the zero-width spaces the CMS embeds in the text.
        email = email_node.text_content().replace('\u200b', '').strip()
        person = Person(primary_org='upper',
                        district=district,
                        name=name,
                        party=party,
                        image=photo_url)
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
        person.add_link(link)
        person.add_source(link)
        yield person
def test_person_add_party():
    """add_party() creates a membership to a party pseudo-organization."""
    person = Person('Groot')
    person.add_party('Green')

    membership = person._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        'name': 'Green',
        'classification': 'party',
    }
def scrape_chamber(self, chamber):
    """Scrape Indiana legislators for one chamber via the IGA API.

    Pulls the member list from the API, then scrapes each member's HTML
    page for address, phone, and district. Yields Person objects; members
    whose page is unavailable or lacks a district are skipped with a
    warning.
    """
    client = ApiClient(self)
    session = self.latest_session()
    base_url = "http://iga.in.gov/legislative"
    api_base_url = "https://api.iga.in.gov"
    chamber_name = "senate" if chamber == "upper" else "house"
    r = client.get("chamber_legislators", session=session,
                   chamber=chamber_name)
    all_pages = client.unpaginate(r)
    for leg in all_pages:
        firstname = leg["firstName"]
        lastname = leg["lastName"]
        party = leg["party"]
        link = leg["link"]
        api_link = api_base_url + link
        # The HTML site uses a "legislator_" prefix where the API does not.
        html_link = base_url + link.replace("legislators/",
                                            "legislators/legislator_")
        try:
            html = get_with_increasing_timeout(self, html_link, fail=True,
                                               kwargs={"verify": False})
        except scrapelib.HTTPError:
            self.logger.warning("Legislator's page is not available.")
            continue
        doc = lxml.html.fromstring(html.text)
        doc.make_links_absolute(html_link)
        # The page carries exactly two <address> elements: postal
        # address first, then phone number.
        address, phone = doc.xpath("//address")
        address = address.text_content().strip()
        address = "\n".join([l.strip() for l in address.split("\n")])
        phone = phone.text_content().strip()
        try:
            district = (doc.xpath("//span[@class='district-heading']")
                        [0].text.lower().replace("district", "").strip())
        except IndexError:
            self.warning("skipping legislator w/o district")
            continue
        # Portraits share the member's path component under /portraits/.
        image_link = base_url + link.replace("legislators/",
                                             "portraits/legislator_")
        legislator = Person(
            primary_org=chamber,
            district=district,
            name=" ".join([firstname, lastname]),
            party=party,
            image=image_link,
        )
        legislator.add_contact_detail(type="address",
                                      note="Capitol Office",
                                      value=address)
        legislator.add_contact_detail(type="voice",
                                      note="Capitol Office",
                                      value=phone)
        legislator.add_link(html_link)
        legislator.add_source(html_link)
        legislator.add_source(api_link)
        yield legislator
def scrape_chamber(self, chamber):
    """
    Scrapes Idaho legislators for the current term only.

    Yields a Person per roster row, attaching district-office (and,
    where present, capitol-office) contact details.
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url, verify=False).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)
    rows = html.xpath('//div[contains(@class, "row-equal-height")]')
    for row in rows:
        img_url = row.xpath('.//img/@src')[0]
        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue
        name = inner.xpath('p/strong')[0].text.replace(u'\xa0',
                                                       ' ').strip()
        # BUG FIX: use a raw string for the regex — '\s' in a plain
        # string literal is an invalid escape sequence
        # (DeprecationWarning now, SyntaxError in future Pythons).
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')
        person_url = inner.xpath('p/a/@href')[0]
        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()
        person = Person(name=name, district=district, party=party,
                        primary_org=chamber, image=img_url, role=role)
        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)
        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')
        person.add_source(url)
        person.add_link(person_url)
        yield person
def test_basic_invalid_person():
    """Clearing the name makes validation raise ValidationError."""
    person = Person("Bob B. Johnson")
    person.add_source(url='foo')
    person.validate()  # sanity check: valid as created

    person.name = None
    with pytest.raises(ValidationError):
        person.validate()
def scrape_lower(self, chamber):
    """Scrape Michigan House members from the public representative list."""
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    columns = ["website", "district", "name", "party",
               "location", "phone", "email"]
    doc = lxml.html.fromstring(self.get(url).text)

    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if not tds:
            continue
        # Map each cell to its column name by position.
        cells = {column: tds[i] for i, column in enumerate(columns)}
        district = str(int(cells['district'].text_content().strip()))
        party = cells['party'].text_content().strip()
        phone = cells['phone'].text_content().strip()
        email = cells['email'].text_content().strip()
        leg_url = cells['website'].xpath("./a")[0].attrib['href']
        name = cells['name'].text_content().strip()
        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning(
                'District {} appears vacant, and will be skipped'.format(district))
            continue
        office = cells['location'].text_content().strip()
        # Expand the House's building abbreviations into full addresses.
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office
        )
        office = re.sub(
            ' CB',
            ' State Capitol Building\nLansing, MI 48909',
            office
        )
        photo_url = self.get_photo_url(leg_url)
        person = Person(name=name, district=district, party=abbr[party],
                        primary_org='lower',
                        image=photo_url[0] if photo_url else None)
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office,
                                  note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone,
                                  note='Capitol Office')
        person.add_contact_detail(type='email', value=email,
                                  note='Capitol Office')
        yield person
def get_member(self, session, chamber, kpid):
    """Scrape one Kansas legislator from the KLISS member API."""
    url = "%smembers/%s" % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)["content"]

    # Normalize the API's party label to the canonical name.
    party = {"Democrat": "Democratic"}.get(content["PARTY"],
                                           content["PARTY"])

    # Session slug used by the legislature's public site URLs.
    session_slugs = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (
        session_slugs[session], kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        (photo_url, ) = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(
            content["FULLNAME"]))
        leg_url = ""
        photo_url = ""

    person = Person(
        name=content["FULLNAME"],
        district=str(content["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": content["OCCUPATION"]}

    note = "Capitol Office"
    address = "\n".join([
        "Room {}".format(content["OFFICENUM"]),
        "Kansas State Capitol Building",
        "300 SW 10th St.",
        "Topeka, KS 66612",
    ])
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=content["EMAIL"],
                              note=note)
    if content["OFFPH"]:
        person.add_contact_detail(type="voice", value=content["OFFPH"],
                                  note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def test_basic_invalid_person():
    """Clearing the name makes validation raise ScrapeValueError."""
    person = Person("Bob B. Johnson")
    person.add_source(url='http://example.com')
    person.validate()  # sanity check: valid as created

    person.name = None
    with pytest.raises(ScrapeValueError):
        person.validate()
def test_person_add_term():
    """add_term() records the organization classification and date range."""
    person = Person('Eternal')
    person.add_term('eternal', 'council', start_date='0001',
                    end_date='9999')

    membership = person._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        'classification': 'council',
    }
    assert membership.start_date == '0001'
    assert membership.end_date == '9999'
def test_legislator_related_chamber_district():
    """A district + chamber legislator gets one membership tied to a post."""
    leg = Person('John Adams', district='1', primary_org='upper')
    leg.pre_save('jurisdiction-id')

    assert len(leg._related) == 1
    membership = leg._related[0]
    assert membership.person_id == leg._id
    assert get_pseudo_id(membership.organization_id) == {
        'classification': 'upper'}
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "upper",
        "label": "1"}
def handle_list_item(self, row):
    """Build a Person from one row of the MN Senate roster spreadsheet.

    Returns None for rows with no first name. Capitol street addresses
    are filed with a 'capitol' note; everything else is a district
    address.
    """
    if not row['First Name']:
        return
    name = '{} {}'.format(row['First Name'], row['Last Name'])
    party = PARTIES[row['Party']]
    leg = Person(name=name, district=row['District'].lstrip('0'),
                 party=party, primary_org='upper', role='Senator',
                 image=self.extra_info[name]['image'])
    leg.add_link(self.extra_info[name]['url'])
    leg.add_contact_detail(type='voice',
                           value=self.extra_info[name]['office_phone'],
                           note='capitol')
    if 'email' in self.extra_info[name]:
        leg.add_contact_detail(type='email',
                               value=self.extra_info[name]['email'],
                               note='capitol')

    row['Zipcode'] = row['Zipcode'].strip()
    # Accommodate for multiple address column naming conventions.
    address1_fields = [row.get('Address'), row.get('Office Building')]
    address2_fields = [row.get('Address2'), row.get('Office Address')]
    row['Address'] = next((a for a in address1_fields
                           if a is not None), False)
    row['Address2'] = next((a for a in address2_fields
                            if a is not None), False)

    # BUG FIX: the original tested a bare generator expression, which is
    # always truthy, so every address was filed as a capitol address and
    # the district branches were dead code. Use any() — guarded by
    # Address2 being truthy, since it may be False here — so only known
    # capitol street addresses get the capitol note.
    capitol_streets = ['95 University Avenue W',
                       '100 Rev. Dr. Martin Luther King']
    if row['Address2'] and any(street in row['Address2']
                               for street in capitol_streets):
        address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                   .format(**row))
        if 'Rm. Number' in row:
            address = '{0} {1}'.format(row['Rm. Number'], address)
        leg.add_contact_detail(type='address', value=address,
                               note='capitol')
    elif row['Address2']:
        address = ('{Address}\n{Address2}\n{City}, {State} {Zipcode}'
                   .format(**row))
        leg.add_contact_detail(type='address', value=address,
                               note='district')
    else:
        address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
        leg.add_contact_detail(type='address', value=address,
                               note='district')

    leg.add_source(self.url)
    leg.add_source(self._html_url)
    return leg
def scrape_member_page(self, chamber, url):
    """Scrape Ohio members from a chamber listing page.

    Yields a Person per member module and visits each homepage via
    scrape_homepage(). Vacant seats are skipped.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"
    ):
        img = legislator.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()
        if "Vacant" in full_name:
            continue
        homepage = homepage.attrib['href']
        party = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        # The last bare text node is the phone; the rest form the
        # office address.
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            # District number follows the <br> inside the heading.
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", ""
                                                        ).strip()
        else:
            # Fall back to the district number embedded in the module's
            # background-image filename (e.g. ".../12.png").
            district = re.findall(
                r"\d+\.png", legislator.attrib['style']
            )[-1].split(".", 1)[0]
        full_name = re.sub(r"\s+", " ", full_name).strip()
        # Official email addresses derive from the zero-padded district.
        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)
        leg = Person(name=full_name, district=district,
                     party=party, primary_org=chamber, image=img)
        leg.add_contact_detail(type='address', value=office,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone,
                               note='Capitol Office')
        leg.add_contact_detail(type='email', value=email,
                               note='Capitol Office')
        self.scrape_homepage(leg, chamber, homepage)
        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def get_member(self, session, chamber, kpid):
    """Scrape one Kansas legislator from the KLISS member API (pre-2019 sessions)."""
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    # Normalize the API's party label to the canonical name.
    party = {'Democrat': 'Democratic'}.get(content['PARTY'],
                                           content['PARTY'])

    # Session slug used by the legislature's public site URLs.
    session_slugs = {
        '2013-2014': 'b2013_14',
        '2015-2016': 'b2015_16',
        '2017-2018': 'b2017_18',
    }
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (
        session_slugs[session], kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(
            content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    note = 'Capitol Office'
    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'],
                              note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'],
                                  note=note)

    person.add_source(url)
    person.add_link(leg_url)
    yield person
def _scrape_legislator(self, row, chamber):
    """Scrape one Montana legislator from a roster table row.

    Fetches the member's detail page for the mailing address, then
    yields a Person with contact details attached.
    """
    name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
    # Collapse the cell's multi-line text into one space-joined name.
    name = " ".join([
        line.strip() for line in name_cell.text_content().split("\n")
        if len(line.strip()) > 0
    ])
    party_letter = row.xpath(
        './td[@class="rosterCell partyCell"]/text()')[0].strip()
    party = dict(D="Democratic", R="Republican")[party_letter]
    # Seat cells read "<chamber abbr> <district>"; strip the prefix.
    chamber_abbr = self._chamber_map[chamber]
    district = (row.xpath('./td[@class="rosterCell seatCell"]'
                          "/text()")[0].replace(chamber_abbr, "").strip())
    try:
        email = (row.xpath('./td[@class="rosterCell emailCell"]'
                           "/a/@href")[0].replace("mailto:", "").strip())
    except IndexError:
        # Some members list no email address at all.
        email = None
    phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                       "/text()")[0].strip() or None)
    details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
    response = self.get(details_url)
    details_page = lxml.html.fromstring(response.text)
    # Mailing address paragraph is labeled "Address" on the detail page.
    address_lines = (details_page.xpath(
        '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
        '/p[contains(text(), "Address")]')[0].text_content().replace(
            "Address", "").split("\n"))
    address = "\n".join(
        [line.strip() for line in address_lines if len(line.strip()) > 0])
    legislator = Person(name=name, district=district, party=party,
                        primary_org=chamber)
    legislator.add_contact_detail(type="address", value=address,
                                  note="Capitol Office")
    if phone is not None:
        legislator.add_contact_detail(type="voice", value=phone,
                                      note="Capitol Office")
    if email is not None:
        legislator.add_contact_detail(type="email", value=email,
                                      note="E-mail")
    legislator.add_link(details_url)
    legislator.add_source(self._roster_url)
    yield legislator
def test_person_add_membership_name():
    """add_membership() with an org name produces a named pseudo-org link."""
    person = Person('Leonardo DiCaprio')
    person.add_membership('Academy of Motion Picture Arts and Sciences',
                          role='winner', start_date='2016')

    membership = person._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        'name': 'Academy of Motion Picture Arts and Sciences'}
    assert membership.person_id == person._id
    assert membership.role == 'winner'
    assert membership.start_date == '2016'
def test_legislator_related_party():
    """A legislator created with a party gets exactly one party membership."""
    leg = Person('John Adams', party='Democratic-Republican')
    leg.pre_save('jurisdiction-id')

    # a party membership
    assert len(leg._related) == 1
    membership = leg._related[0]
    assert membership.person_id == leg._id
    assert get_pseudo_id(membership.organization_id) == {
        'classification': 'party',
        'name': 'Democratic-Republican'}
    assert membership.role == 'member'
def test_person_add_membership():
    """add_membership() with an Organization records role and start date."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org')
    person.add_membership(org, 'member', start_date='2007')

    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
def parse_senate(self, div, chamber):
    """Parse one CA senator's roster <div> into a Person.

    Returns None (with a warning) when the name carries no recognized
    party suffix.
    """
    name = div.xpath('.//h3/text()')[0]
    # Party is encoded as a name suffix, e.g. "Jane Doe (D)".
    if name.endswith(' (R)'):
        party = 'Republican'
    elif name.endswith(' (D)'):
        party = 'Democratic'
    else:
        self.warning('skipping ' + name)
        return None
    name = name.split(' (')[0]
    district = div.xpath(
        './/div[contains(@class, "senator-district")]/div/text()'
    )[0].strip().lstrip('0')
    photo_url = div.xpath('.//img/@src')[0]
    person = Person(
        name=name,
        party=party,
        district=district,
        primary_org=chamber,
        image=photo_url,
    )
    url = div.xpath('.//a/@href')[0]
    person.add_link(url)
    # CA senators have working emails, but they're not putting them on
    # their public pages anymore
    email = self._construct_email(chamber, name)
    person.add_contact_detail(type='email', value=email,
                              note='Senate Office')
    office_path = './/div[contains(@class, "{}")]//p'
    # Capitol office paragraphs read "address; phone".
    for addr in div.xpath(
            office_path.format('views-field-field-senator-capitol-office')):
        note = 'Senate Office'
        addr, phone = addr.text_content().split('; ')
        person.add_contact_detail(type='address', value=addr.strip(),
                                  note=note)
        person.add_contact_detail(type='voice', value=phone.strip(),
                                  note=note)
    n = 1
    for addr in div.xpath(
            office_path.format('views-field-field-senator-district-office')):
        note = 'District Office #{}'.format(n)
        # NOTE(review): the inner loop rebinds `addr` (element -> line
        # string). Lines without a "; phone" part fall back to
        # address-only via the ValueError branch.
        for addr in addr.text_content().strip().splitlines():
            try:
                addr, phone = addr.strip().replace(u'\xa0',
                                                   ' ').split('; ')
                person.add_contact_detail(type='address',
                                          value=addr.strip(), note=note)
                person.add_contact_detail(type='voice',
                                          value=phone.strip(), note=note)
            except ValueError:
                addr = addr.strip().replace(u'\xa0', ' ')
                person.add_contact_detail(type='address',
                                          value=addr.strip(), note=note)
        n += 1
    return person