def handle_list_item(self, row):
    if not row['First Name']:
        return

    # This Dan Schoen hack very probably can be removed after April 2018.
    if row['First Name'] == 'Dan' and row['Last Name'] == 'Schoen':
        self.danSchoenSeen = True
        return

    name = '{} {}'.format(row['First Name'], row['Last Name'])
    party = PARTIES[row['Party']]
    leg = Person(name=name, district=row['District'].lstrip('0'),
                 party=party, primary_org='upper', role='Senator',
                 image=self.extra_info[name]['image'])
    leg.add_link(self.extra_info[name]['url'])
    leg.add_contact_detail(type='voice',
                           value=self.extra_info[name]['office_phone'],
                           note='capitol')
    if 'email' in self.extra_info[name]:
        leg.add_contact_detail(type='email',
                               value=self.extra_info[name]['email'],
                               note='capitol')

    row['Zipcode'] = row['Zipcode'].strip()

    # Accommodate multiple address column naming conventions.
    address1_fields = [row.get('Address'), row.get('Office Building')]
    address2_fields = [row.get('Address2'), row.get('Office Address')]
    row['Address'] = next((a for a in address1_fields if a is not None), False)
    row['Address2'] = next((a for a in address2_fields if a is not None), False)

    # Bug fix: the original `if (a in row['Address2'] for a in ...)` tested a
    # bare generator expression, which is always truthy, so this branch was
    # taken unconditionally. `any()` performs the intended membership check.
    if row['Address2'] and any(
            a in row['Address2']
            for a in ['95 University Avenue W',
                      '100 Rev. Dr. Martin Luther King']):
        address = '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row)
        if 'Rm. Number' in row:
            address = '{0} {1}'.format(row['Rm. Number'], address)
        leg.add_contact_detail(type='address', value=address, note='capitol')
    elif row['Address2']:
        address = '{Address}\n{Address2}\n{City}, {State} {Zipcode}'.format(**row)
        leg.add_contact_detail(type='address', value=address, note='district')
    else:
        address = '{Address}\n{City}, {State} {Zipcode}'.format(**row)
        leg.add_contact_detail(type='address', value=address, note='district')

    leg.add_source(self.url)
    leg.add_source(self._html_url)

    return leg
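# Note on the `any(...)` fix above: a bare generator expression is always
# truthy, so `if (a in s for a in items):` takes the first branch
# unconditionally. A minimal standalone illustration:
capitol_markers = ['95 University Avenue W', '100 Rev. Dr. Martin Luther King']
address2 = '123 Main St'

gen = (marker in address2 for marker in capitol_markers)
assert bool(gen) is True  # the generator object itself is truthy
assert any(marker in address2 for marker in capitol_markers) is False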
def scrape_senator_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"):
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
        img = img[img.find("(") + 1:img.find(")")]
        full_name = legislator.xpath(
            ".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(
            ".//a[@class='profileImageLink']")[0].attrib["href"]
        district = legislator.xpath(
            ".//div[@class='profileDistrict']/a/text()")[0].split("#")[1]

        if "Vacant" in full_name:
            continue

        homepage = self.get(homepage_url).text
        page = lxml.html.fromstring(homepage)

        phone = page.xpath("//div[@class='phone']/span/text()")[0]

        address_lines = page.xpath("//div[@class='address']/span/text()")
        address = "\n".join(address_lines)

        party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
        if "Republican" in party_image:
            party = "Republican"
        elif "Democrat" in party_image:
            party = "Democratic"

        email = (
            "rep{0:0{width}}@ohiohouse.gov"
            if chamber == "lower"
            else "sd{0:0{width}}@ohiosenate.gov"
        ).format(int(district), width=2)

        leg = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            image=img,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_contact_detail(type="email", value=email, note="Capitol Office")
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
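# The email templates above rely on a nested format spec: `{0:0{width}}`
# zero-pads the district number to `width` digits. A quick standalone check:
template = 'sd{0:0{width}}@ohiosenate.gov'
assert template.format(4, width=2) == 'sd04@ohiosenate.gov'
assert template.format(12, width=2) == 'sd12@ohiosenate.gov'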
def scrape_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"):
        img = legislator.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()

        if "Vacant" in full_name:
            continue

        homepage = homepage.attrib['href']
        party = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            district = re.findall(
                r"\d+\.png", legislator.attrib['style'])[-1].split(".", 1)[0]
        full_name = re.sub(r"\s+", " ", full_name).strip()
        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower'
            else 'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)
        leg = Person(name=full_name, district=district, party=party,
                     primary_org=chamber, image=img)
        leg.add_contact_detail(type='address', value=office, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        self.scrape_homepage(leg, chamber, homepage)

        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def scrape_legislator(self, chamber, name, url):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
                   .split()[1].strip().lstrip('0')

    party = page.xpath('//h2').pop().text_content()
    party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    elif party == 'I':
        party = 'Independent'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']

    leg = Person(name, district=district, party=party,
                 image=photo_url, primary_org=chamber)
    leg.add_link(url)
    leg.add_source(url)
    self.scrape_offices(leg, page)

    yield leg
def scrape_chamber(self, session):
    session_key = SESSION_KEYS[session]
    # Typo fix: was `legislators_reponse`.
    legislators_response = self.api_client.get('legislators', session=session_key)

    for legislator in legislators_response:
        url_name = legislator['WebSiteUrl'].split('/')[-1]
        chamber_name = 'house' if legislator['Chamber'] == 'H' else 'senate'
        img = 'https://www.oregonlegislature.gov/{}/MemberPhotos/{}.jpg'.format(
            chamber_name, url_name
        )

        party = legislator['Party']
        if party == 'Democrat':
            party = 'Democratic'

        person = Person(name='{} {}'.format(legislator['FirstName'],
                                            legislator['LastName']),
                        primary_org={'S': 'upper', 'H': 'lower'}[legislator['Chamber']],
                        party=party,
                        district=legislator['DistrictNumber'],
                        image=img)
        person.add_link(legislator['WebSiteUrl'])
        person.add_source(legislator['WebSiteUrl'])

        if legislator['CapitolAddress']:
            person.add_contact_detail(type='address',
                                      value=legislator['CapitolAddress'],
                                      note='Capitol Office')
        if legislator['CapitolPhone']:
            person.add_contact_detail(type='voice',
                                      value=legislator['CapitolPhone'],
                                      note='Capitol Office')
        person.add_contact_detail(type='email',
                                  value=legislator['EmailAddress'],
                                  note='Capitol Office')

        yield person
def scrape_member(self, chamber, member_url):
    member_page = self.get(member_url).text
    doc = lxml.html.fromstring(member_page)

    photo_url = doc.xpath('//div[@id="bioImage"]/img/@src')[0]
    name_pieces = doc.xpath('//span[@id="name"]/text()')[0].split()
    full_name = ' '.join(name_pieces[1:-1]).strip()

    party = name_pieces[-1]
    if party == '(R)':
        party = 'Republican'
    elif party == '(D)':
        party = 'Democratic'
    elif party == '(I)':
        party = 'Independent'

    district = doc.xpath('//span[@id="districtHeader"]/text()')[0].split()[-1]

    person = Person(name=full_name, district=district, party=party,
                    primary_org=chamber, image=photo_url)
    person.add_source(member_url)
    person.add_link(member_url)

    address = '\n'.join(doc.xpath('//div[@id="FrankfortAddresses"]//'
                                  'span[@class="bioText"]/text()'))

    phone = None
    fax = None
    phone_numbers = doc.xpath('//div[@id="PhoneNumbers"]//span[@class="bioText"]/text()')
    for num in phone_numbers:
        if num.startswith('Annex: '):
            num = num.replace('Annex: ', '')
            if num.endswith(' (fax)'):
                fax = num.replace(' (fax)', '')
            else:
                phone = num

    emails = doc.xpath(
        '//div[@id="EmailAddresses"]//span[@class="bioText"]//a/text()'
    )
    email = reduce(
        lambda match, address: address if '@lrc.ky.gov' in str(address) else match,
        [None] + emails
    )

    if phone:
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    if fax:
        person.add_contact_detail(type='fax', value=fax, note='Capitol Office')
    if email:
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

    if address.strip() == "":
        self.warning("Missing Capitol Office!!")
    else:
        person.add_contact_detail(type='address', value=address, note='Capitol Office')

    yield person
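# The `reduce` call above keeps the *last* address containing '@lrc.ky.gov'
# and falls back to None when nothing matches. On Python 3 this requires
# `from functools import reduce`. A standalone sketch of the same pattern:
from functools import reduce

emails = ['webmaster@example.com', 'jane.doe@lrc.ky.gov']
email = reduce(
    lambda match, address: address if '@lrc.ky.gov' in str(address) else match,
    [None] + emails,
)
assert email == 'jane.doe@lrc.ky.gov'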
def scrape_upper_chamber(self, term):
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//table[@summary]')[0] \
            .xpath('.//td//a[contains(@href, "biographies")]'):
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning(
                "District {} appears to be empty".format(district))
            continue
        else:
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

        url = a.get('href')

        person = Person(
            primary_org='upper',
            district=district,
            name=name.strip(),
            party=party,
        )
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
def scrape_member(self, chamber, link):
    name = link.text.strip()
    leg_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")

    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    if party == 'Democrat':
        party = 'Democratic'
    elif party == 'No Party Specified':
        party = 'Independent'

    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url)

    leg.add_link(leg_url)
    leg.add_source(leg_url)

    leg_page = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
def handle_list_item(self, item):
    link = item.xpath('.//div[@class="rep_style"]/a')[0]
    name = link.text_content().strip()

    if 'Vacant' in name or 'Resigned' in name or 'Pending' in name:
        return

    party = item.xpath('.//div[@class="party_style"]/text()')[0].strip()
    party = {'D': 'Democratic', 'R': 'Republican'}[party]

    district = item.xpath(
        './/div[@class="district_style"]/text()')[0].strip()

    leg_url = link.get('href')
    split_url = parse.urlsplit(leg_url)
    member_id = parse.parse_qs(split_url.query)['MemberId'][0]
    image = "http://www.flhouse.gov/FileStores/Web/Imaging/Member/{}.jpg".format(
        member_id)

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative', image=image)
    rep.add_link(leg_url)
    rep.add_source(leg_url)
    rep.add_source(self.url)
    self.scrape_page(RepDetail, leg_url, obj=rep)
    return rep
def handle_list_item(self, item):
    name = " ".join(item.xpath('.//text()'))
    name = re.sub(r'\s+', " ", name).replace(" ,", ",").strip()

    if 'Vacant' in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == 'Democrat':
        party = 'Democratic'

    leg_url = item.get('href')

    leg = Person(name=name, district=district, party=party,
                 primary_org='upper', role='Senator')
    leg.add_link(leg_url)
    leg.add_source(self.url)
    leg.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=leg)
    return leg
def test_full_person():
    person = ScrapePerson('Tom Sawyer')
    person.add_identifier('1')
    person.add_name('Tommy', start_date='1880')
    person.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    person.add_link('http://example.com/link')
    person.add_source('http://example.com/source')

    # import person
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])

    # get person from db and assert it imported correctly
    p = Person.objects.get()
    assert 'ocd-person' in p.id
    assert p.name == person.name

    assert p.identifiers.all()[0].identifier == '1'
    assert p.identifiers.all()[0].scheme == ''

    assert p.other_names.all()[0].name == 'Tommy'
    assert p.other_names.all()[0].start_date == '1880'

    assert p.contact_details.all()[0].type == 'phone'
    assert p.contact_details.all()[0].value == '555-555-1234'
    assert p.contact_details.all()[0].note == 'this is fake'

    assert p.links.all()[0].url == 'http://example.com/link'
    assert p.sources.all()[0].url == 'http://example.com/source'
def handle_list_item(self, item):
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './div/text()[normalize-space()]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Initialize to None so a failed validation can't leave these unbound
    # (the original raised NameError when validation failed).
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative', image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
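# `validate_phone_number` and `validate_email_address` are helpers defined
# elsewhere in the scraper module. A minimal sketch of what they might look
# like (hypothetical implementations, shown only for context):
import re

def validate_phone_number(text):
    # e.g. 651-296-0000 or (651) 296-0000
    return bool(re.match(r'^\(?\d{3}\)?[ -]?\d{3}-\d{4}$', text.strip()))

def validate_email_address(text):
    # deliberately loose: one '@' with something on both sides and a dot
    return bool(re.match(r'^[^@\s]+@[^@\s]+\.[^@\s]+$', text.strip()))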
def handle_list_item(self, item):
    name = " ".join(item.xpath(".//text()"))
    name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()

    if "Vacant" in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == "Democrat":
        party = "Democratic"

    leg_url = item.get("href")

    name = fix_name(name)
    leg = Person(
        name=name,
        district=district,
        party=party,
        primary_org="upper",
        role="Senator",
    )
    leg.add_link(leg_url)
    leg.add_source(self.url)
    leg.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=leg)
    return leg
def scrape(self):
    committee_d = {}

    for councilman, committees in self.councilMembers():
        p = Person(' '.join((councilman['First name'], councilman['Last name'])))

        if p.name == 'Toni Preckwinkle':
            continue
        elif p.name == 'Robert Steele':
            district = 2
        elif p.name == 'Jerry Butler':
            district = 3
        elif p.name == 'Sean Morrison':
            district = 17
        else:
            district = re.findall(r'\d+', councilman['Person Name']['url'])[0]

        start_date = self.toTime(councilman['Start Date']).date()
        end_date = self.toTime(councilman['End Date']).date()
        if end_date == datetime.date(2018, 12, 2):
            end_date = ''
        else:
            end_date = end_date.isoformat()

        p.add_term('Commissioner', 'legislature',
                   district='District {}'.format(district),
                   start_date=start_date.isoformat(),
                   end_date=end_date)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.add_source(councilman['Person Name']['url'])

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': 'Cook County Board of Commissioners'})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    for o in committee_d.values():
        yield o
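# `self.mdY2Ymd` above is assumed to convert a 'month/day/year' date string
# into ISO 'YYYY-MM-DD'. A hypothetical standalone equivalent (the actual
# input format used by the site is an assumption here):
from datetime import datetime

def mdY2Ymd(date_string):
    # '1/23/2017' -> '2017-01-23'
    return datetime.strptime(date_string, '%m/%d/%Y').strftime('%Y-%m-%d')

assert mdY2Ymd('1/23/2017') == '2017-01-23'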
def bos_scrape_people(self):
    page = self.lxmlize(MEMBER_LIST)
    people = page.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

    for person in people:
        image, name = [self.get_one(person, x) for x in [
            ".//img",
            ".//a[contains(@href, 'councillors') and (text()!='')]"
        ]]
        role = person.xpath(".//br")[0].tail.strip()
        # Fallback if we don't get one from the homepage.
        image = image.attrib['src']
        homepage = name.attrib['href']
        name = clean_name(name.text)
        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']

        p = Person(name=name, district=role, image=image,
                   primary_org="legislature", biography=info['bio'])
        p.add_link(url=homepage, note='homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_LIST)
        yield p
def scrape_upper_chamber(self, term):
    url = 'https://senado.pr.gov/Pages/Senadores.aspx'

    doc = self.lxmlize(url)
    links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')
    for link in links:
        senator_page = self.lxmlize(link)
        profile_links = self.get_nodes(senator_page,
                                       '//ul[@class="profiles-links"]/li')

        name_text = self.get_node(
            senator_page,
            '//span[@class="name"]').text_content().strip()
        # Convert to title case as some names are in all-caps
        name = re.sub(r'^Hon\.', '', name_text, flags=re.IGNORECASE).strip().title()

        party = profile_links[0].text_content().strip()
        # Translate to English since being an Independent is a universal construct
        if party == "Independiente":
            party = "Independent"

        photo_url = self.get_node(senator_page, '//div[@class="avatar"]//img/@src')

        if profile_links[1].text_content().strip() == "Senador por Distrito":
            district_text = self.get_node(
                senator_page,
                '//div[@class="module-distrito"]//span[@class="headline"]'
            ).text_content()
            district = district_text.replace('DISTRITO', '', 1).replace('\u200b', '').strip()
        elif profile_links[1].text_content().strip() == "Senador por Acumulación":
            district = "At-Large"

        phone_node = self.get_node(senator_page, '//a[@class="contact-data tel"]')
        phone = phone_node.text_content().strip()

        email_node = self.get_node(senator_page, '//a[@class="contact-data email"]')
        email = email_node.text_content().replace('\u200b', '').strip()

        person = Person(primary_org='upper',
                        district=district,
                        name=name,
                        party=party,
                        image=photo_url)

        person.add_contact_detail(type='email', value=email, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        person.add_link(link)
        person.add_source(link)

        yield person
def scrape_chamber(self, chamber):
    self._party_map = {
        'Democrat': 'Democratic',
        'Republican': 'Republican',
        'Non Affiliated': 'Independent',
        'Not Affiliated': 'Independent',
    }

    if chamber == 'upper':
        url = 'http://senate.legis.state.ak.us/'
    else:
        url = 'http://house.legis.state.ak.us/'

    page = self.lxmlize(url)
    items = page.xpath('//ul[@class="item"]')[1].getchildren()

    for item in items:
        photo_url = item.xpath('.//img/@src')[0]
        name = item.xpath('.//strong/text()')[0]
        leg_url = item.xpath('.//a/@href')[0]
        email = item.xpath('.//a[text()="Email Me"]/@href')
        if email:
            email = email[0].replace('mailto:', '')
        else:
            self.warning('no email for ' + name)

        party = district = None
        skip = False

        for dt in item.xpath('.//dt'):
            dd = dt.xpath('following-sibling::dd')[0].text_content()
            label = dt.text.strip()
            if label == 'Party:':
                party = dd
            elif label == 'District:':
                district = dd
            elif label.startswith('Deceased'):
                skip = True
                self.warning('skipping deceased ' + name)
                break

        if skip:
            continue

        person = Person(
            primary_org=chamber,
            district=district,
            name=name,
            party=self._party_map[party],
            image=photo_url,
        )
        person.add_source(leg_url)
        person.add_link(leg_url)

        # scrape offices
        self._scrape_offices(person, leg_url, email)

        yield person
def scrape_chamber(self, chamber, session):
    if chamber == 'upper':
        chamber_slug = 'Senate'
    elif chamber == 'lower':
        chamber_slug = 'Assembly'

    session_slug = self.jurisdiction.session_slugs[session]
    leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (
        chamber_slug, session_slug)
    leg_json_url = ('http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s'
                    % (session_slug, chamber_slug))

    resp = json.loads(self.get(leg_json_url).text)

    for item in resp:
        # empty district
        empty_names = ['District No', 'Vacant']
        if any(name in item['FullName'] for name in empty_names):
            continue

        last, first = item['FullName'].split(",", 1)
        item['FullName'] = "{first} {last}".format(last=last, first=first).strip()

        person = Person(name=item['FullName'],
                        district=item['DistrictNbr'],
                        party=item['Party'],
                        primary_org=chamber,
                        image=item['PhotoURL'])

        leg_url = leg_base_url + item['DistrictNbr']

        # hack to get the legislator ID
        html = self.get(leg_url).text
        for l in html.split('\n'):
            if 'GetLegislatorDetails' in l:
                leg_id = l.split(',')[1].split("'")[1]

        # fetch the json used by the page
        leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/{}/Legislator?id='
                           .format(session_slug) + leg_id)
        leg_resp = json.loads(self.get(leg_details_url).text)
        details = leg_resp['legislatorDetails']

        address = details['Address1']
        address2 = details['Address2']
        if address2:
            address += ' ' + address2
        address += '\n%s, NV %s' % (details['City'], details['Zip'])

        phone = details['LCBPhone']
        email = details['LCBEmail']

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        # Bug fix: the original gated this on `phone`; the email detail
        # should be gated on `email` itself.
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')

        person.add_link(leg_details_url)
        person.add_source(leg_details_url)

        yield person
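# The line-by-line scan for 'GetLegislatorDetails' above is brittle. Assuming
# the page embeds a call shaped like `GetLegislatorDetails(house, 'ID', ...)`
# (which is what the original split-on-comma-then-quote implies), a regex
# expresses the same extraction more directly. A sketch under that assumption:
import re

def extract_leg_id(html):
    match = re.search(r"GetLegislatorDetails\([^,]*,\s*'([^']+)'", html)
    return match.group(1) if match else None

assert extract_leg_id("GetLegislatorDetails(1, 'ABC123', true)") == 'ABC123'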
def test_invalid_fields_related_item():
    p1 = ScrapePerson('Dwayne')
    p1.add_link('http://example.com')
    p1 = p1.as_dict()
    p1['links'][0]['test'] = 3

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([p1])
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]

        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue

        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')

        person_url = inner.xpath('p/a/@href')[0]
        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=img_url,
                        role=role)

        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)

        yield person
def scrape_chamber(self, chamber):
    client = ApiClient(self)
    session = self.latest_session()
    base_url = "http://iga.in.gov/legislative"
    api_base_url = "https://api.iga.in.gov"
    chamber_name = "senate" if chamber == "upper" else "house"
    r = client.get("chamber_legislators", session=session, chamber=chamber_name)
    all_pages = client.unpaginate(r)
    for leg in all_pages:
        firstname = leg["firstName"]
        lastname = leg["lastName"]
        party = leg["party"]
        link = leg["link"]
        api_link = api_base_url + link
        html_link = base_url + link.replace("legislators/", "legislators/legislator_")
        try:
            html = get_with_increasing_timeout(self, html_link, fail=True,
                                               kwargs={"verify": False})
        except scrapelib.HTTPError:
            self.logger.warning("Legislator's page is not available.")
            continue
        doc = lxml.html.fromstring(html.text)
        doc.make_links_absolute(html_link)
        address, phone = doc.xpath("//address")
        address = address.text_content().strip()
        address = "\n".join([l.strip() for l in address.split("\n")])
        phone = phone.text_content().strip()
        try:
            district = (doc.xpath("//span[@class='district-heading']")[0]
                        .text.lower().replace("district", "").strip())
        except IndexError:
            self.warning("skipping legislator w/o district")
            continue
        image_link = base_url + link.replace("legislators/", "portraits/legislator_")
        legislator = Person(
            primary_org=chamber,
            district=district,
            name=" ".join([firstname, lastname]),
            party=party,
            image=image_link,
        )
        legislator.add_contact_detail(type="address", note="Capitol Office", value=address)
        legislator.add_contact_detail(type="voice", note="Capitol Office", value=phone)
        legislator.add_link(html_link)
        legislator.add_source(html_link)
        legislator.add_source(api_link)
        yield legislator
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url, verify=False).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]

        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue

        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')

        person_url = inner.xpath('p/a/@href')[0]
        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=img_url,
                        role=role)

        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)

        yield person
def scrape_lower(self, chamber):
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    table = [
        "website",
        "district",
        "name",
        "party",
        "location",
        "phone",
        "email"
    ]

    data = self.get(url).text
    doc = lxml.html.fromstring(data)

    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if len(tds) == 0:
            continue
        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]
        district = str(int(metainf['district'].text_content().strip()))
        party = metainf['party'].text_content().strip()
        phone = metainf['phone'].text_content().strip()
        email = metainf['email'].text_content().strip()
        leg_url = metainf['website'].xpath("./a")[0].attrib['href']
        name = metainf['name'].text_content().strip()
        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning('District {} appears vacant, and will be skipped'.format(district))
            continue
        office = metainf['location'].text_content().strip()
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office
        )
        office = re.sub(
            ' CB',
            ' State Capitol Building\nLansing, MI 48909',
            office
        )

        photo_url = self.get_photo_url(leg_url)

        person = Person(name=name, district=district, party=abbr[party],
                        primary_org='lower',
                        image=photo_url[0] if photo_url else None)

        person.add_link(leg_url)
        person.add_source(leg_url)

        person.add_contact_detail(type='address', value=office, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

        yield person
def get_member(self, session, chamber, kpid):
    url = "%smembers/%s" % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)["content"]

    party = content["PARTY"]
    if party == "Democrat":
        party = "Democratic"

    slug = {
        "2013-2014": "b2013_14",
        "2015-2016": "b2015_16",
        "2017-2018": "b2017_18",
        "2019-2020": "b2019_20",
    }[session]
    leg_url = "http://www.kslegislature.org/li/%s/members/%s/" % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        (photo_url, ) = legislator_page.xpath('//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(content["FULLNAME"]))
        leg_url = ""
        photo_url = ""

    person = Person(
        name=content["FULLNAME"],
        district=str(content["DISTRICT"]),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {"occupation": content["OCCUPATION"]}

    address = "\n".join([
        "Room {}".format(content["OFFICENUM"]),
        "Kansas State Capitol Building",
        "300 SW 10th St.",
        "Topeka, KS 66612",
    ])

    note = "Capitol Office"
    person.add_contact_detail(type="address", value=address, note=note)
    person.add_contact_detail(type="email", value=content["EMAIL"], note=note)
    if content["OFFPH"]:
        person.add_contact_detail(type="voice", value=content["OFFPH"], note=note)

    person.add_source(url)
    person.add_link(leg_url)

    yield person
def _scrape_legislator(self, row, chamber):
    name_cell = row.xpath('./td[@class="rosterCell nameCell"]/a')[0]
    name = " ".join([
        line.strip()
        for line in name_cell.text_content().split("\n")
        if len(line.strip()) > 0
    ])

    party_letter = row.xpath(
        './td[@class="rosterCell partyCell"]/text()')[0].strip()
    party = dict(D="Democratic", R="Republican")[party_letter]

    chamber_abbr = self._chamber_map[chamber]

    district = (row.xpath('./td[@class="rosterCell seatCell"]'
                          "/text()")[0].replace(chamber_abbr, "").strip())

    try:
        email = (row.xpath('./td[@class="rosterCell emailCell"]'
                           "/a/@href")[0].replace("mailto:", "").strip())
    except IndexError:
        email = None

    phone = (row.xpath('./td[@class="rosterCell phoneCell"]'
                       "/text()")[0].strip() or None)

    details_url = "https://leg.mt.gov{}".format(name_cell.attrib["href"])
    response = self.get(details_url)
    details_page = lxml.html.fromstring(response.text)
    address_lines = (details_page.xpath(
        '//div[@class="col-lg-6 col-md-12 text-lg-left align-self-center"]'
        '/p[contains(text(), "Address")]')[0].text_content().replace(
            "Address", "").split("\n"))
    address = "\n".join(
        [line.strip() for line in address_lines if len(line.strip()) > 0])

    legislator = Person(name=name, district=district, party=party,
                        primary_org=chamber)
    legislator.add_contact_detail(type="address", value=address,
                                  note="Capitol Office")
    if phone is not None:
        legislator.add_contact_detail(type="voice", value=phone,
                                      note="Capitol Office")
    if email is not None:
        legislator.add_contact_detail(type="email", value=email,
                                      note="E-mail")
    legislator.add_link(details_url)
    legislator.add_source(self._roster_url)

    yield legislator
def get_member(self, session, chamber, kpid):
    url = '%smembers/%s' % (ksapi.url, kpid)
    content = json.loads(self.get(url).text)['content']

    party = content['PARTY']
    if party == 'Democrat':
        party = 'Democratic'

    slug = {
        '2013-2014': 'b2013_14',
        '2015-2016': 'b2015_16',
        '2017-2018': 'b2017_18'
    }[session]
    leg_url = 'http://www.kslegislature.org/li/%s/members/%s/' % (slug, kpid)

    try:
        legislator_page = self.lxmlize(leg_url)
        photo_url, = legislator_page.xpath(
            '//img[@class="profile-picture"]/@src')
    except scrapelib.HTTPError:
        self.warning("{}'s legislator bio page not found".format(
            content['FULLNAME']))
        leg_url = ''
        photo_url = ''

    person = Person(
        name=content['FULLNAME'],
        district=str(content['DISTRICT']),
        primary_org=chamber,
        party=party,
        image=photo_url,
    )
    person.extras = {'occupation': content['OCCUPATION']}

    address = '\n'.join([
        'Room {}'.format(content['OFFICENUM']),
        'Kansas State Capitol Building',
        '300 SW 10th St.',
        'Topeka, KS 66612',
    ])

    note = 'Capitol Office'
    person.add_contact_detail(type='address', value=address, note=note)
    person.add_contact_detail(type='email', value=content['EMAIL'], note=note)
    if content['OFFPH']:
        person.add_contact_detail(type='voice', value=content['OFFPH'], note=note)

    person.add_source(url)
    person.add_link(leg_url)

    yield person
def scrape_member_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"):
        img = legislator.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()

        if "Vacant" in full_name:
            continue

        homepage = homepage.attrib['href']
        party = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            district = re.findall(
                r"\d+\.png", legislator.attrib['style']
            )[-1].split(".", 1)[0]
        full_name = re.sub(r"\s+", " ", full_name).strip()
        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower'
            else 'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)
        leg = Person(name=full_name, district=district, party=party,
                     primary_org=chamber, image=img)
        leg.add_contact_detail(type='address', value=office, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        self.scrape_homepage(leg, chamber, homepage)

        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def parse_senate(self, div, chamber):
    name = div.xpath('.//h3/text()')[0]
    if name.endswith(' (R)'):
        party = 'Republican'
    elif name.endswith(' (D)'):
        party = 'Democratic'
    else:
        self.warning('skipping ' + name)
        return None
    name = name.split(' (')[0]

    district = div.xpath(
        './/div[contains(@class, "senator-district")]/div/text()'
    )[0].strip().lstrip('0')
    photo_url = div.xpath('.//img/@src')[0]

    person = Person(
        name=name,
        party=party,
        district=district,
        primary_org=chamber,
        image=photo_url,
    )

    url = div.xpath('.//a/@href')[0]
    person.add_link(url)

    # CA senators have working emails, but they're not putting them on
    # their public pages anymore
    email = self._construct_email(chamber, name)
    person.add_contact_detail(type='email', value=email, note='Senate Office')

    office_path = './/div[contains(@class, "{}")]//p'

    for addr in div.xpath(office_path.format('views-field-field-senator-capitol-office')):
        note = 'Senate Office'
        addr, phone = addr.text_content().split('; ')
        person.add_contact_detail(type='address', value=addr.strip(), note=note)
        person.add_contact_detail(type='voice', value=phone.strip(), note=note)

    n = 1
    for addr in div.xpath(office_path.format('views-field-field-senator-district-office')):
        note = 'District Office #{}'.format(n)
        # note: the inner loop reuses (shadows) `addr` line by line
        for addr in addr.text_content().strip().splitlines():
            try:
                addr, phone = addr.strip().replace(u'\xa0', ' ').split('; ')
                person.add_contact_detail(type='address', value=addr.strip(), note=note)
                person.add_contact_detail(type='voice', value=phone.strip(), note=note)
            except ValueError:
                addr = addr.strip().replace(u'\xa0', ' ')
                person.add_contact_detail(type='address', value=addr.strip(), note=note)
        n += 1

    return person
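# `self._construct_email` is defined elsewhere in this scraper. A plausible
# sketch, written as a free function so it is self-contained (hypothetical:
# the real address scheme may well differ from this guess):
def construct_email(chamber, full_name):
    # assumption: firstname.lastname at the chamber's domain
    domain = 'senate.ca.gov' if chamber == 'upper' else 'assembly.ca.gov'
    local = full_name.lower().replace(' ', '.')
    return '{}@{}'.format(local, domain)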
def scrape_senator(self, district):
    link = "https://legislature.maine.gov/District-{}".format(district)
    page = lxml.html.fromstring(self.get(link).text)
    page.make_links_absolute(link)

    main = page.xpath('//div[@id="main"]/div[@id="content"]')[0]
    title = main.xpath("h1")[0].text
    # e.g. District 25 - State Senator Catherine Breen (D - Cumberland)...
    title_match = re.match(
        r"District (\d+) - State Senator ([^\(]+) \(([DRI])", title)
    _, name, party = title_match.groups()
    name = re.sub(r"\s+", " ", name.strip())
    party = _party_map[party]

    image_url = address = phone = email = None
    for p in main.xpath("p"):
        if p.xpath(".//img") and not image_url:
            image_url = p.xpath(".//img/@src")[0]
            continue
        field, _, value = p.text_content().partition(":")
        value = value.strip()
        if field in ("Address", "Mailing Address"):
            address = value
        elif field in ("Phone", "Home Phone"):
            phone = value
        elif field == "Email":
            email = value

    person = Person(
        name=name,
        district=district,
        image=image_url,
        primary_org="upper",
        party=party,
    )
    person.add_link(link)
    person.add_source(link)

    if address:
        person.add_contact_detail(type="address", value=address,
                                  note="District Office")
    if phone:
        person.add_contact_detail(type="voice", value=clean_phone(phone),
                                  note="District Phone")
    # Guard the email detail as well; `email` stays None when the page
    # lists none, and the original passed that None straight through.
    if email:
        person.add_contact_detail(type="email", value=email,
                                  note="District Email")

    yield person
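# `clean_phone` is assumed to normalize free-form phone strings before they
# are stored. A minimal hypothetical version that keeps only digits and
# reformats ten-digit numbers:
import re

def clean_phone(raw):
    digits = re.sub(r'\D', '', raw)
    if len(digits) == 10:
        return '{}-{}-{}'.format(digits[:3], digits[3:6], digits[6:])
    return raw.strip()

assert clean_phone('(207) 287-1515') == '207-287-1515'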
def scrape_chamber(self, chamber=None):
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
        except KeyError:
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        person = Person(name=leg['name'], district=leg['district'],
                        party=leg['party'], primary_org=chamber,
                        image=leg['image'])

        for source in leg['source']:
            person.add_source(source)

        try:
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                comm = Organization(name=ctty['name'],
                                    classification="committee",
                                    chamber=ctty_chamber)
                comm.add_member(person, role="member")
        except KeyError:
            self.log("XXX: Warning, %s has no scraped Committees" % leg['name'])

        person.add_link(leg['homepage'])
        person.add_source(leg['homepage'])

        if leg['addr']:
            person.add_contact_detail(type='address', value=leg['addr'],
                                      note='Capitol Office')
        if leg['phone']:
            person.add_contact_detail(type='voice', value=leg['phone'],
                                      note='Capitol Office')
        # Bug fix: the original guarded on `phone` and passed the phone value
        # as the email address; gate on and pass the email field instead
        # (the key exists in this data, per the later revision of this method).
        if leg['email']:
            person.add_contact_detail(type='email', value=leg['email'],
                                      note='Capitol Office')
        yield person
def scrape_lower_legislator(self, url, leg_info):
    page = self.lxmlize(url)

    name = page.xpath(
        '//span[@id="body_FormView5_FULLNAMELabel"]/text()'
    )[0].strip()
    if name.startswith("District ") or name.startswith("Vacant "):
        self.warning("Seat is vacant: {}".format(name))
        return

    photo = page.xpath(
        '//img[contains(@src, "/h_reps/RepPics")]'
    )[0].attrib['src']

    party_flags = {
        "Democrat": "Democratic",
        "Republican": "Republican",
        "Independent": "Independent"
    }
    party_info = page.xpath(
        '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()'
    )[0].strip()
    party = party_flags[party_info]

    try:
        email = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )[0].strip()
    except IndexError:
        email = None

    district = leg_info['dist'].replace('Dist', '').strip()

    person = Person(name=name,
                    party=party,
                    district=district,
                    primary_org='lower',
                    image=photo)

    contacts = [
        (leg_info["office"], "address"),
        (leg_info["phone"], "voice"),
        (email, "email"),
    ]

    for value, key in contacts:
        if value:
            person.add_contact_detail(type=key, value=value, note="District Office")

    person.add_source(url)
    person.add_link(url)

    yield person
def scrape_chamber(self, chamber=None):
    metainf = self.scrape_leg_page(get_legislator_listing_url(chamber))
    for leg in metainf:
        try:
            chamber = {"House": "lower", "Senate": "upper"}[leg['chamber']]
        except KeyError:
            print("")
            print(" ERROR: Bad Legislator page.")
            print(" -> " + "\n -> ".join(leg['source']))
            print("")
            print(" Added this workaround because of a bad legislator")
            print(" page, while they filled their info out.")
            print("")
            print(" Emailed webmaster. Told to wait.")
            print(" - PRT, Jun 23, 2014")
            print("")
            continue

        person = Person(name=leg['name'], district=leg['district'],
                        party=leg['party'], primary_org=chamber,
                        image=leg['image'])

        for source in leg['source']:
            person.add_source(source)

        try:
            for ctty in leg['ctty']:
                flag = 'Joint Legislative'
                if ctty['name'][:len(flag)] == flag:
                    ctty_chamber = "joint"
                else:
                    ctty_chamber = chamber

                comm = Organization(name=ctty['name'],
                                    classification="committee",
                                    chamber=ctty_chamber)
                comm.add_member(person, role="member")
        except KeyError:
            self.warn("%s has no scraped Committees" % leg['name'])

        person.add_link(leg['homepage'])

        if leg['addr']:
            person.add_contact_detail(type='address', value=leg['addr'],
                                      note='Capitol Office')
        if leg['phone']:
            person.add_contact_detail(type='voice', value=leg['phone'],
                                      note='Capitol Office')
        if leg['email']:
            person.add_contact_detail(type='email', value=leg['email'],
                                      note='Capitol Office')
        if leg['fax']:
            person.add_contact_detail(type='fax', value=leg['fax'],
                                      note='Capitol Office')

        yield person
def scrape_senator_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"):
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
        img = img[img.find('(') + 1:img.find(')')]
        full_name = legislator.xpath(
            ".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(
            ".//a[@class='profileImageLink']")[0].attrib['href']
        district = legislator.xpath(
            ".//div[@class='profileDistrict']/a/text()")[0].split("#")[1]

        if "Vacant" in full_name:
            continue

        homepage = self.get(homepage_url).text
        page = lxml.html.fromstring(homepage)

        phone = page.xpath("//div[@class='phone']/span/text()")[0]

        address_lines = page.xpath("//div[@class='address']/span/text()")
        address = "\n".join(address_lines)

        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower'
            else 'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     primary_org=chamber, image=img)

        leg.add_contact_detail(type='address', value=address, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def handle_list_item(self, row):
    if not row["First Name"]:
        return

    name = "{} {}".format(row["First Name"], row["Last Name"])
    party = PARTIES[row["Party"]]
    leg = Person(
        name=name,
        district=row["District"].lstrip("0"),
        party=party,
        primary_org="upper",
        role="Senator",
        image=self.extra_info[name]["image"],
    )
    leg.add_link(self.extra_info[name]["url"])
    leg.add_contact_detail(
        type="voice", value=self.extra_info[name]["office_phone"], note="capitol"
    )
    if "email" in self.extra_info[name]:
        leg.add_contact_detail(
            type="email", value=self.extra_info[name]["email"], note="capitol"
        )

    row["Zipcode"] = row["Zipcode"].strip()

    # Accommodate multiple address column naming conventions.
    address1_fields = [row.get("Address"), row.get("Office Building")]
    address2_fields = [row.get("Address2"), row.get("Office Address")]
    row["Address"] = next((a for a in address1_fields if a is not None), False)
    row["Address2"] = next((a for a in address2_fields if a is not None), False)

    # Same bug fix as the earlier revision of this handler: a bare generator
    # expression is always truthy; `any()` restores the intended check.
    if row["Address2"] and any(
        a in row["Address2"]
        for a in ["95 University Avenue W", "100 Rev. Dr. Martin Luther King"]
    ):
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        if "Rm. Number" in row:
            address = "{0} {1}".format(row["Rm. Number"], address)
        leg.add_contact_detail(type="address", value=address, note="capitol")
    elif row["Address2"]:
        address = "{Address}\n{Address2}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")
    else:
        address = "{Address}\n{City}, {State} {Zipcode}".format(**row)
        leg.add_contact_detail(type="address", value=address, note="district")

    leg.add_source(self.url)
    leg.add_source(self._html_url)

    return leg
def scrape_chamber(self, chamber):
    leg_list_url = utils.urls['people'][chamber]
    page = self.get(leg_list_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(leg_list_url)

    # email addresses are hidden away on a separate page now, at
    # least for Senators
    contact_url = utils.urls['contacts'][chamber]
    contact_page = self.get(contact_url).text
    contact_page = lxml.html.fromstring(contact_page)

    for link in page.xpath("//a[contains(@href, '_bio.cfm')]"):
        full_name = ' '.join(link.text.split(', ')[::-1])
        full_name = re.sub(r'\s+', ' ', full_name)

        district = link.getparent().getnext().tail.strip()
        district = re.search(r"District (\d+)", district).group(1)

        party = link.getparent().tail.strip()[-2]
        if party == 'R':
            party = 'Republican'
        elif party == 'D':
            party = 'Democratic'

        url = link.get('href')
        leg_id = url.split('?id=')[1]

        person = Person(name=full_name, district=district,
                        party=party, primary_org=chamber)
        person.add_link(leg_list_url)
        person.add_source(leg_list_url)

        # Scrape email, offices, photo.
        page = self.get(url).text
        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(url)

        email = self.scrape_email_address(contact_page, leg_id)
        self.scrape_offices(url, doc, person, email)
        self.scrape_photo_url(url, doc, person)

        yield person
def scrape_chamber(self, chamber):
    url = {
        'upper': 'https://legis.delaware.gov/json/Senate/GetSenators',
        'lower': 'https://legis.delaware.gov/json/House/GetRepresentatives',
    }[chamber]
    source_url = {
        'upper': 'https://legis.delaware.gov/Senate',
        'lower': 'https://legis.delaware.gov/House',
    }[chamber]

    data = self.post(url).json()['Data']

    for item in data:
        if item['PersonFullName'] is None:
            # Vacant district
            self.warning('District {} was detected as vacant'.format(
                item['DistrictNumber']))
            continue

        leg_url = ('https://legis.delaware.gov/'
                   'LegislatorDetail?personId={}'.format(item['PersonId']))
        doc = self.lxmlize(leg_url)
        image_url = doc.xpath('//img/@src')[0]

        leg = Person(name=item['PersonFullName'],
                     district=str(item['DistrictNumber']),
                     party=PARTY[item['PartyCode']],
                     primary_org=chamber,
                     image=image_url)

        self.scrape_contact_info(leg, doc)

        leg.add_link(leg_url, note="legislator page")
        leg.add_source(source_url, note="legislator list page")

        yield leg
def handle_list_item(self, item):
    name = item.text

    if 'resigned' in name.lower() or 'vacated' in name.lower():
        return
    if name in CHAMBER_MOVES and self.chamber != CHAMBER_MOVES[name]:
        return

    name, action, date = clean_name(name)

    leg = Person(name=name)
    leg.add_source(self.url)
    leg.add_source(item.get('href'))
    leg.add_link(item.get('href'))
    yield from self.scrape_page(
        self.detail_page,
        item.get('href'),
        session=self.kwargs['session'],
        committees=self.kwargs['committees'],
        obj=leg,
    )
    yield leg
def handle_list_item(self, item):
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    name_text = info_nodes[0].xpath('./b/text()')[0]
    url = info_nodes[0].get('href')

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # Initialize to None so a failed validation can't leave these unbound
    # (the original raised NameError when validation failed).
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_node = info_nodes[1]
    email_text = email_node.text
    email_text = email_text.replace('Email: ', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative', image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address)
    if phone:
        rep.add_contact_detail(type='voice', value=phone)
    if email:
        rep.add_contact_detail(type='email', value=email)
    rep.add_source(self.url)

    yield rep