def scrape(self, chamber, term):
    """Scrape upper-chamber members from the legislators listing page.

    The lower chamber is handled elsewhere, so 'lower' returns immediately.
    """
    if chamber == 'lower':
        return
    html = self.urlopen(self.legislators_url)
    doc = lxml.html.fromstring(html)
    members = doc.xpath('//div[@id="cdlist"]/div[@class="cd"]')
    for member in members:
        member_xpath = member.xpath  # shorthand for repeated queries below
        res = {}
        # First text node holds "<title> <name>", e.g. "Senator Jane Doe".
        title_name = member_xpath('div[@class="cdinfo"]/text()')[0].strip()
        res['url'] = self.base_url + member_xpath('div[@class="cdinfo"]/a/@href')[0]
        district = member_xpath('div[@class="cdinfo"]/a')[0].text_content().strip()
        # A second anchor, when present, carries the member's email address.
        if len(member_xpath('div[@class="cdinfo"]/a')) > 1:
            res['email'] = member_xpath('div[@class="cdinfo"]/a')[1].text_content().strip()
        else:
            res['email'] = None
        # `titles` is defined elsewhere in the module; strip the matching
        # title prefix to obtain the bare name.
        # NOTE(review): if no title matches, res['full_name'] is never set
        # and the Legislator() call below would fail — confirm upstream data.
        for t in titles:
            if t in title_name:
                res['title'] = t
                res['full_name'] = title_name.replace(t, '').strip()
        leg = Legislator(term, chamber, district, **res)
        leg.update(**res)  # redundant with **res above; kept as-is
        self.save_legislator( leg )
def scrape_bio(self, term, chamber, district, name, url):
    """Fetch a legislator's framed bio page and return a populated Legislator.

    Raises AssertionError when no party marker is found in the page header.
    """
    # Appending this query opens the committee section without a second request.
    url += '&TableRow=1.5.5'
    frame_doc = self.lxmlize(url)
    # The real content lives inside the right-hand frame.
    actual_url = frame_doc.xpath("//frame[@name='right']/@src")[0]
    doc = self.lxmlize(actual_url)
    # The party abbreviation sits in the trailing characters of the header.
    header_tail = doc.xpath('//div[@id="page_header"]')[0].text.strip()[-3:]
    if '(D)' in header_tail:
        party = 'Democratic'
    elif '(R)' in header_tail:
        party = 'Republican'
    else:
        raise AssertionError("No party found for {name}".format(name=name))
    leg = Legislator(term, chamber, district, name, party=party)
    photo = doc.xpath('//img[contains(@src, "jpg")]/@src')
    if photo:
        leg['photo_url'] = photo[0]
    leg.update(self.scrape_contact_info(doc))
    return leg
def scrape(self, chamber, term):
    """Scrape upper-chamber members from a table on the listing page.

    The lower chamber is handled elsewhere, so 'lower' returns immediately.
    """
    if chamber == 'lower':
        return
    html = self.urlopen(self.legislators_url)
    doc = lxml.html.fromstring(html)
    # Member cells live in a fixed table/row position on the page.
    # NOTE(review): index [27] is brittle — breaks if page layout changes.
    table = doc.xpath('//table')[27]
    row = table.xpath('tr')[1]
    members = row.xpath('td')
    for member in members:
        res = {}
        member_xpath = member.xpath  # shorthand for repeated queries
        # The name appears in one of two markup variants.
        name = member_xpath('span/b/a/text()')
        if name:
            res['full_name'] = name[0]
        else:
            res['full_name'] = member_xpath('a/span/b/text()')[0].strip()
        res['url'] = member_xpath('span/a/@href')[0].strip()
        district = member_xpath('span/a')[0].text_content().strip()
        # A second text node inside the link, when present, is a title.
        title_el = member_xpath('span/a/text()')
        if len(title_el) > 1:
            res['title'] = title_el[1].strip()
        else:
            res['title'] = ""
        leg = Legislator(term, chamber, district, **res)
        leg.update(**res)  # redundant with **res above; kept as-is
        self.save_legislator( leg )
def scrape(self, chamber, term):
    """Scrape Indiana legislators for the latest term from in.gov listings."""
    self.validate_term(term, latest_only=True)
    chamber_name = {"upper": "Senate", "lower": "House"}[chamber]
    # Adjacent string literals concatenate into a single URL.
    url = "http://www.in.gov/cgi-bin/legislative/listing/" "listing-2.pl?data=alpha&chamber=%s" % chamber_name
    with self.urlopen(url) as page:
        page = lxml.html.fromstring(page)
        for link in page.xpath("//div[@id='col2']/p/a"):
            name = link.text.strip()
            href = link.get("href")
            # Sibling text holds "<party>, District <n>".
            details = link.getnext().text.strip()
            party = details.split(",")[0]
            if party == "Democrat":
                party = "Democratic"
            district = re.search(r"District (\d+)", details).group(1)
            district = district.lstrip("0")  # drop zero-padding
            # Get the legislator's bio page.
            leg = Legislator(term, chamber, district, name, party=party, url=href)
            leg.add_source(url)
            leg.add_source(href)
            details = self.scrape_details(chamber, term, href, page, party, leg)
            if details:
                leg.update(details)
            self.save_legislator(leg)
def scrape_bio(self, term, chamber, district, name, url):
    """Build a Legislator from the member's framed bio page.

    Raises AssertionError when no party marker is found in the header.
    """
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    frame_doc = self.lxmlize(url)
    # The real content lives inside the right-hand frame.
    actual_url = frame_doc.xpath("//frame[@name='right']/@src")[0]
    doc = self.lxmlize(actual_url)
    # party is in one of these — the last three chars of the header text.
    party = doc.xpath('//div[@id="page_header"]')[0].text.strip()[-3:]
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'
    else:
        raise AssertionError("No party found for {name}".format(name=name))
    leg = Legislator(term, chamber, district, name, party=party)
    photo_url = doc.xpath('//img[contains(@src, "jpg")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]
    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def process_person(self, person):
    """Convert a pupa-style person record into a Legislator and save it.

    Derives term/chamber/district/party from the person's memberships and
    routes contact details into district vs. capitol offices by the
    free-text 'note' field.
    """
    term = self.metadata['terms'][-1]['name']
    chamber = None
    district = None
    party = None
    name = person['name']
    url = person['links'][0]['url']
    photo_url = person['image']
    for membership in self.memberships[person['_id']]:
        org = membership['org']
        post = membership['post']
        if not org:
            # NOTE(review): debug print for bad data; org.get() below would
            # still fail on a falsy org — confirm this never happens.
            print(membership)
        classification = org.get('classification') or org.get(
            'organization__classification')
        if classification in ('upper', 'lower'):
            chamber = classification
            district = post['label']
        elif classification == 'party':
            party = org['name']
        elif classification == 'legislature':
            # DC — unicameral council, treated as 'upper'.
            chamber = 'upper'
            district = post['label']
    district_office = {}
    capitol_office = {}
    email = ''
    for detail in person['contact_details']:
        # rename voice->phone
        if detail['type'] == 'voice':
            detail['type'] = 'phone'
        elif detail['type'] == 'email':
            email = detail['value']
        if 'district' in detail['note'].lower():
            district_office[detail['type']] = detail['value']
        elif 'capitol' in detail['note'].lower():
            capitol_office[detail['type']] = detail['value']
    leg = Legislator(term, chamber, district, name, party=party,
                     url=url, photo_url=photo_url, email=email)
    if district_office:
        leg.add_office('district', 'District Office', **district_office)
    if capitol_office:
        leg.add_office('capitol', 'Capitol Office', **capitol_office)
    for source in person['sources']:
        leg.add_source(source['url'])
    leg.update(**person['extras'])
    self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape one legislator from an <option> element on the member list.

    Skips entries with no district and detail pages that have gone missing
    (HTTP errors from the upstream cache).
    """
    url = urlparse.urljoin(self.url, option.attrib['value'])
    # Option text is "Name, Party, District N"; strip title prefixes.
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    if district == '[N/A]':
        msg = 'No district found for %r; skipping.'
        self.logger.warning(msg, name)
        return
    leg = Legislator(term, chamber, district, name, party=party)
    # Scrape leg page.
    try:
        html = self.urlopen(url)
    except scrapelib.HTTPError as exc:
        # As of July 2014, this only happens when a page has
        # gone missing from their varnish server.
        # if exc.response.status_code is 503:
        self.logger.exception(exc)
        self.logger.warning('Skipping legislator at url: %s' % url)
        return
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)
    # Scrape committees. Rows are (committee, type, role) cells.
    for tr in doc.xpath(
            '//div[@class="legislator-committees-container"]//table//tr'):
        committee, committee_type, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        if committee != "Committee Name":  # skip the header row
            leg.add_role(role, term, chamber=chamber, committee=committee)
    # Scrape offices: the two <address> elements are address and phone.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()
    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)
    leg.add_office(address=dist_office, name='Capitol Office',
                   type='capitol', phone=phone)
    self.save_legislator(leg)
def process_person(self, person):
    """Convert a pupa-style person record into a Legislator and save it.

    Chamber/district/party come from the person's memberships; contact
    details are split into district vs. capitol offices by 'note' text.
    """
    term = self.metadata['terms'][-1]['name']
    chamber = None
    district = None
    party = None
    name = person['name']
    url = person['links'][0]['url']
    photo_url = person['image']
    for membership in self.memberships[person['_id']]:
        org = membership['org']
        post = membership['post']
        if not org:
            # NOTE(review): debug print for bad data; org.get() below would
            # still fail on a falsy org — confirm this never happens.
            print(membership)
        classification = org.get('classification') or org.get('organization__classification')
        if classification in ('upper', 'lower'):
            chamber = classification
            district = post['label']
        elif classification == 'party':
            party = org['name']
        elif classification == 'legislature':
            # DC — unicameral council, treated as 'upper'.
            chamber = 'upper'
            district = post['label']
    district_office = {}
    capitol_office = {}
    email = ''
    for detail in person['contact_details']:
        # rename voice->phone
        if detail['type'] == 'voice':
            detail['type'] = 'phone'
        elif detail['type'] == 'email':
            email = detail['value']
        if 'district' in detail['note'].lower():
            district_office[detail['type']] = detail['value']
        elif 'capitol' in detail['note'].lower():
            capitol_office[detail['type']] = detail['value']
    leg = Legislator(term, chamber, district, name,
                     party=party,
                     url=url,
                     photo_url=photo_url,
                     email=email
                     )
    if district_office:
        leg.add_office('district', 'District Office', **district_office)
    if capitol_office:
        leg.add_office('capitol', 'Capitol Office', **capitol_office)
    for source in person['sources']:
        leg.add_source(source['url'])
    leg.update(**person['extras'])
    self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape one legislator from an <option> element on the member list.

    Skips entries with no district and detail pages that have gone missing
    (HTTP errors from the upstream cache).
    """
    url = urlparse.urljoin(self.url, option.attrib["value"])
    # Option text is "Name, Party, District N"; strip title prefixes.
    name, party, district = re.split(r"\s*,\s*", option.text.strip())
    name = re.sub(r"^(Sen\.|Rep\.)\s+", "", name)
    district = re.sub(r"^District\s+", "", district)
    if district == "[N/A]":
        msg = "No district found for %r; skipping."
        self.logger.warning(msg, name)
        return
    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)
    # Scrape leg page.
    try:
        html = self.urlopen(url)
    except scrapelib.HTTPError as exc:
        # As of July 2014, this only happens when a page has
        # gone missing from their varnish server.
        # if exc.response.status_code is 503:
        self.logger.exception(exc)
        self.logger.warning("Skipping legislator at url: %s" % url)
        return
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)
    # Scrape committees. Each table row is (committee, role).
    for tr in doc.xpath("//table//tr"):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if "member" in role.lower():
            role = "committee member"
        elif "chair" in role.lower():
            role = "chair"
        leg.add_role(role, term, chamber=chamber, committee=committee)
    # Scrape offices: the two <address> elements are address and phone.
    dist_office, phone = doc.xpath("//address")
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r" {2,}", "", dist_office)
    phone = phone.text_content().strip()
    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)
    leg.add_office(address=dist_office, name="District Office",
                   type="district", phone=phone)
    self.save_legislator(leg)
def scrape_bio(self, term, chamber, district, name, url):
    """Build a Legislator (with committee roles) from a bio page."""
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # party is in one of these
    # NOTE(review): xpath() returns a LIST of text nodes; the membership
    # tests below match only if some node is exactly '(D)'/'(R)'. If
    # neither matches, `party` stays a list — confirm against live pages.
    party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'
    leg = Legislator(term, chamber, district, name, party=party, url=url)
    photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]
    # Committee roles: header text nodes flip `position`; other text nodes
    # list one committee per line.
    roles = defaultdict(lambda: {})
    position = 'member'
    for text in doc.xpath('//td[@width="584"]/descendant::font/text()'):
        text = text.strip()
        if text == 'Committee Chair:':
            position = 'chair'
        elif text == 'Committee Co-chair:':
            position = 'co-chair'
        else:
            for committee in text.splitlines():
                roles[committee].update(
                    role='committee member', term=term,
                    chamber=chamber, committee=committee,
                    party=party, position=position)
    for role in roles.values():
        leg.add_role(**role)
    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def scrape_bio(self, term, chamber, district, name, url):
    """Build a Legislator (with committee roles) from a bio page."""
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # party is in one of these
    # NOTE(review): xpath() returns a LIST of text nodes; the membership
    # tests below match only if some node is exactly '(D)'/'(R)'. If
    # neither matches, `party` stays a list — confirm against live pages.
    party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'
    leg = Legislator(term, chamber, district, name, party=party, url=url)
    photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]
    # Committee roles: header text nodes flip `position`; other text nodes
    # list one committee per line.
    roles = defaultdict(lambda: {})
    position = 'member'
    for text in doc.xpath('//td[@width="584"]/descendant::font/text()'):
        text = text.strip()
        if text == 'Committee Chair:':
            position = 'chair'
        elif text == 'Committee Co-chair:':
            position = 'co-chair'
        else:
            for committee in text.splitlines():
                roles[committee].update(
                    role='committee member', term=term,
                    chamber=chamber, committee=committee,
                    party=party, position=position)
    for role in roles.values():
        leg.add_role(**role)
    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def scrape(self, chamber, term):
    """Scrape upper-chamber members from a simple table of contact cells.

    The lower chamber is handled elsewhere, so 'lower' returns immediately.
    """
    if chamber == 'lower':
        return
    html = self.urlopen(self.legislators_url)
    doc = lxml.html.fromstring(html)
    cells = doc.xpath('//table/tbody/tr/td')
    for cell in cells:
        cell_xpath = cell.xpath  # shorthand for repeated queries
        res = {}
        res['full_name'] = cell_xpath('a')[0].text_content()
        # The second anchor is a mailto: link.
        res['email'] = cell_xpath('a/@href')[1].replace('mailto:', '')
        res['title'] = cell_xpath('text()')[0].strip()
        res['phone'] = cell_xpath('text()')[1].strip()
        res['url'] = self.base_url + cell_xpath('a/@href')[0]
        # NOTE(review): the literal string 'district' is passed as the
        # district value — looks like a placeholder; confirm intended value.
        leg = Legislator(term, chamber, 'district', **res)
        leg.update(**res)
        self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape a single legislator chosen from a dropdown <option>."""
    detail_url = urlparse.urljoin(self.url, option.attrib['value'])
    # "Name, Party, District N" -> three fields; drop title prefixes.
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    if district == '[N/A]':
        msg = 'No district found for %r; skipping.'
        self.logger.warning(msg, name)
        return

    legislator = Legislator(term, chamber, district, name, party=party)
    legislator.add_source(self.url)

    # Fetch and parse the legislator's detail page.
    page = lxml.html.fromstring(self.urlopen(detail_url))
    page.make_links_absolute(self.url)
    legislator.add_source(detail_url)

    # Committee memberships: each table row is (committee, role).
    for row in page.xpath('//table//tr'):
        committee_cell, role_cell = row
        committee = committee_cell.text_content().strip()
        role = role_cell.text_content().strip()
        lowered = role.lower()
        if 'member' in lowered:
            role = 'committee member'
        elif 'chair' in lowered:
            role = 'chair'
        legislator.add_role(role, term, chamber=chamber, committee=committee)

    # District office: the two <address> elements are address and phone.
    address_el, phone_el = page.xpath('//address')
    address = re.sub(r' {2,}', '', address_el.text_content().strip())
    phone = phone_el.text_content().strip()

    legislator.update(
        email=page.xpath('string(//a[starts-with(@href, "mailto:")]/@href)'),
        photo_url=page.xpath('string(//img[contains(@class, "member")]/@src)'))
    legislator.add_office(
        address=address, name='District Office', type='district', phone=phone)
    self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Indiana legislators for the latest term from in.gov listings."""
    self.validate_term(term, latest_only=True)
    chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]
    url = ("http://www.in.gov/cgi-bin/legislative/listing/"
           "listing-2.pl?data=alpha&chamber=%s" % chamber_name)
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    for link in page.xpath("//div[@id='col2']/p/a"):
        name = link.text.strip()
        href = link.get('href')
        # Sibling text holds "<party>, District <n>".
        details = link.getnext().text.strip()
        party = details.split(',')[0]
        if party == 'Democrat':
            party = 'Democratic'
        district = re.search(r'District (\d+)', details).group(1)
        district = district.lstrip('0')  # drop zero-padding
        # Get the legislator's bio page.
        leg = Legislator(term, chamber, district, name, party=party,
                         url=href)
        leg.add_source(url)
        leg.add_source(href)
        details = self.scrape_details(chamber, term, href, page, party, leg)
        if details:
            leg.update(details)
        self.fix_hotgarbage(leg)
        self.save_legislator(leg)
def scrape_legislator(self, chamber, term, option):
    """Scrape a single legislator chosen from a dropdown <option>."""
    url = urlparse.urljoin(self.url, option.attrib['value'])
    # Option text is "Name, Party, District N"; strip title prefixes.
    name, party, district = re.split(r'\s*,\s*', option.text.strip())
    name = re.sub(r'^(Sen\.|Rep\.)\s+', '', name)
    district = re.sub(r'^District\s+', '', district)
    leg = Legislator(term, chamber, district, name, party=party)
    leg.add_source(self.url)
    # Scrape leg page.
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(self.url)
    leg.add_source(url)
    # Scrape committees. Each table row is (committee, role).
    for tr in doc.xpath('//table//tr'):
        committee, role = tr
        committee = committee.text_content().strip()
        role = role.text_content().strip()
        if 'member' in role.lower():
            role = 'committee member'
        elif 'chair' in role.lower():
            role = 'chair'
        leg.add_role(role, term, chamber=chamber, committee=committee)
    # Scrape offices: the two <address> elements are address and phone.
    dist_office, phone = doc.xpath('//address')
    dist_office = dist_office.text_content().strip()
    dist_office = re.sub(r' {2,}', '', dist_office)
    phone = phone.text_content().strip()
    email = doc.xpath('string(//a[starts-with(@href, "mailto:")]/@href)')
    photo_url = doc.xpath('string(//img[contains(@class, "member")]/@src)')
    leg.update(email=email, photo_url=photo_url)
    leg.add_office(
        address=dist_office, name='District Office', type='district',
        phone=phone)
    self.save_legislator(leg)
def scrape_bio(self, term, chamber, district, name, url):
    """Build a Legislator from the member's bio page."""
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # party is in one of these
    # NOTE(review): xpath() returns a LIST of text nodes; the membership
    # tests below match only if some node is exactly '(D)'/'(R)'. If
    # neither matches, `party` stays a list — confirm against live pages.
    party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'
    leg = Legislator(term, chamber, district, name, party=party, url=url)
    photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]
    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def scrape_bio(self, term, chamber, district, name, url):
    """Build a Legislator from the member's bio page."""
    # this opens the committee section without having to do another request
    url += '&TableRow=1.5.5'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # party is in one of these
    # NOTE(review): xpath() returns a LIST of text nodes; the membership
    # tests below match only if some node is exactly '(D)'/'(R)'. If
    # neither matches, `party` stays a list — confirm against live pages.
    party = doc.xpath('//div[@align="center"]/b/font[@size="2"]/text()')
    if '(D)' in party:
        party = 'Democratic'
    elif '(R)' in party:
        party = 'Republican'
    leg = Legislator(term, chamber, district, name, party=party, url=url)
    photo_url = doc.xpath('//img[contains(@src, "FieldElemFormat")]/@src')
    if photo_url:
        leg['photo_url'] = photo_url[0]
    contact_info = self.scrape_contact_info(doc)
    leg.update(contact_info)
    return leg
def scrape(self, chamber, term):
    """Scrape Indiana legislators for the latest term from in.gov listings."""
    self.validate_term(term, latest_only=True)
    chamber_name = {'upper': 'Senate', 'lower': 'House'}[chamber]
    url = ("http://www.in.gov/cgi-bin/legislative/listing/"
           "listing-2.pl?data=alpha&chamber=%s" % chamber_name)
    page = self.urlopen(url)
    page = lxml.html.fromstring(page)
    for link in page.xpath("//div[@id='col2']/p/a"):
        name = link.text.strip()
        href = link.get('href')
        # Sibling text holds "<party>, District <n>".
        details = link.getnext().text.strip()
        party = details.split(',')[0]
        if party == 'Democrat':
            party = 'Democratic'
        district = re.search(r'District (\d+)', details).group(1)
        district = district.lstrip('0')  # drop zero-padding
        # Get the legislator's bio page.
        leg = Legislator(term, chamber, district, name, party=party,
                         url=href)
        leg.add_source(url)
        leg.add_source(href)
        details = self.scrape_details(chamber, term, href, page, party, leg)
        if details:
            leg.update(details)
        self.fix_hotgarbage(leg)
        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Montana legislators from the session's member CSV dump.

    Rows with missing detail pages (NoDetails) are logged and skipped.
    """
    for tdata in self.metadata["terms"]:
        if term == tdata["name"]:
            year = tdata["start_year"]
            session_number = tdata["session_number"]
            break

    # Fetch the csv.
    url = "http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt" % (
        session_number,
        year,
        chamber == "upper" and "Senate" or "House",
    )

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()
    fieldnames = ["last_name", "first_name", "party", "district",
                  "address", "city", "state", "zip"]
    csv_parser = csv.DictReader(data, fieldnames)
    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District field is "<HD|SD> <number>"; remove it from the entry.
        hd_or_sd, district = entry.pop("district").split()

        # Party letter -> full name; remove the raw field from the entry.
        party = {"D": "Democratic", "R": "Republican"}[entry.pop("party")]

        # Get full name properly capped.
        fullname = "%s %s" % (entry["first_name"].capitalize(),
                              entry["last_name"].capitalize())

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]

        # Build the district office from the CSV address fields.
        address = "\n".join([entry["address"],
                             "%s, %s %s" % (entry["city"], entry["state"],
                                            entry["zip"])])
        office = dict(name="District Office", type="district", phone=None,
                      fax=None, email=None, address=address)

        try:
            deets = self._scrape_details(detail_url)
        except NoDetails:
            self.logger.warning("No details found at %r" % detail_url)
            continue

        # Add the details and delete junk.
        entry.update(deets)
        del entry["first_name"], entry["last_name"]

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator["url"] = detail_url
        office["phone"] = deets.get("phone")
        office["fax"] = deets.get("fax")
        legislator.add_office(**office)
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Illinois legislators for a term from the member list pages."""
    term_slug = term[:-2]  # drop the 2-char suffix to get the GA slug
    url = MEMBER_LIST_URL[chamber] % term_slug
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # Member rows start at index 2 of the 5th table on the page.
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name, _, _, district, party = row.xpath('td')
        district = district.text
        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue
        party = {'D': 'Democratic', 'R': 'Republican',
                 'I': 'Independent'}[party.text]
        leg_url = name.xpath('a/@href')[0]
        name = name.text_content().strip()
        # inactive legislator, skip them for now
        if name.endswith('*'):
            name = name.strip('*')
            continue
        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)
        leg = Legislator(term, chamber, district, name, party=party,
                         url=leg_url)
        leg.add_source(url)
        hotgarbage = (
            'Senate Biography Information for the 98th General '
            'Assembly is not currently available.')
        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            self.save_legislator(leg)
            continue
        # Percent-encode the photo path so odd characters survive in the URL.
        photo_url = leg_doc.xpath('//img[contains(@src, "/members/")]/@src')[0]
        photo_url_parsed = urlparse(photo_url)
        encoded_path = quote(photo_url_parsed.path)
        photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
        leg.update(photo_url=photo_url)
        leg.add_source(leg_url)
        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            email = email[0].tail.strip()
        else:
            email = None

        # function for turning an IL contact info table to office details
        def _table_to_office(table, office_type, office_name, email=None):
            addr = ''
            phone = ''
            fax = None
            for row in table.xpath('tr'):
                row = row.text_content().strip()
                # skip rows that aren't part of address
                if 'Office:' in row or row == 'Cook County':
                    continue
                # fax number row ends with FAX
                elif 'FAX' in row:
                    fax = row.replace(' FAX', '')
                # phone number starts with ( [make it more specific?]
                elif row.startswith('('):
                    phone = row
                # everything else is an address
                else:
                    addr += (row + '\n')
            # Skip offices whose accumulated address is only a comma.
            if addr.strip() != ',':
                leg.add_office(office_type, office_name,
                               address=addr.strip(), phone=phone, fax=fax,
                               email=email)

        # extract both offices from tables
        table = leg_doc.xpath('//table[contains(string(), "Springfield Office")]')
        if table:
            _table_to_office(table[3], 'capitol', 'Springfield Office', email)
        table = leg_doc.xpath('//table[contains(string(), "District Office")]')
        if table:
            _table_to_office(table[3], 'district', 'District Office')

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Montana legislators from the member CSV, fuzzy-matching
    CSV names against names seen on the committee pages."""
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Scrape committees. Also produce a name dictionary that can be
    # used for fuzzy matching between the committee page names and the
    # all-caps csv names.
    for name_dict, _ in scrape_committees(year, chamber):
        pass

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()
    fieldnames = [
        'last_name', 'first_name', 'party', 'district', 'address', 'city',
        'state', 'zip'
    ]
    csv_parser = csv.DictReader(data, fieldnames)
    district_leg_urls = self._district_legislator_dict()

    for entry in csv_parser:
        if not entry:
            continue

        # City.
        entry['city'] = entry['city'].title()

        # Address.
        entry['address'] = entry['address'].title()

        # District is "<HD|SD> <number>".
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        entry['party'] = party
        del entry['party']  # NOTE(review): set-then-delete; net effect is delete

        # Get full name properly capped.
        _fullname = '%s %s' % (entry['first_name'].capitalize(),
                               entry['last_name'].capitalize())
        city_lower = entry['city'].lower()
        fullname = difflib.get_close_matches(_fullname,
                                             name_dict[city_lower],
                                             cutoff=0.5)

        # If there are no close matches with the committee page,
        # use the title-capped first and last name.
        if len(fullname) < 1:
            fullname = _fullname
            # msg = 'No matches found for "%s" with "%s" from %r'
            # self.debug(msg % (_fullname, fullname,
            #                   name_dict[city_lower]))
        else:
            fullname = fullname[0]
            # if _fullname != fullname:
            #     msg = 'matched "%s" with "%s" from %r'
            #     self.debug(msg % (_fullname, fullname,
            #                       name_dict[city_lower]))

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]
        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Montana legislators from the session's member CSV dump."""
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()
    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)
    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District field is "<HD|SD> <number>"; remove it from the entry.
        hd_or_sd, district = entry.pop('district').split()

        # Party letter -> full name; remove the raw field from the entry.
        party = {'D': 'Democratic', 'R': 'Republican'}[entry.pop('party')]

        # Get full name properly capped.
        fullname = '%s %s' % (entry['first_name'].capitalize(),
                              entry['last_name'].capitalize())

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]

        # Build the district office from the CSV address fields.
        address = '\n'.join([
            entry['address'],
            '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])])
        office = dict(name='District Office', type='district', phone=None,
                      fax=None, email=None, address=address)

        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url
        office['phone'] = deets.get('phone')
        office['fax'] = deets.get('fax')
        legislator.add_office(**office)
        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Illinois legislators for a term from the member list pages."""
    term_slug = term[:-2]  # drop the 2-char suffix to get the GA slug
    url = MEMBER_LIST_URL[chamber] % term_slug
    html = self.urlopen(url)
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)
    # Member rows start at index 2 of the 5th table on the page.
    for row in doc.xpath('//table')[4].xpath('tr')[2:]:
        name, _, _, district, party = row.xpath('td')
        district = district.text
        if party.text_content().strip() == "":
            self.warning("Garbage party: Skipping!")
            continue
        party = {
            'D': 'Democratic',
            'R': 'Republican',
            'I': 'Independent'
        }[party.text]
        leg_url = name.xpath('a/@href')[0]
        name = name.text_content().strip()
        # inactive legislator, skip them for now
        if name.endswith('*'):
            name = name.strip('*')
            continue
        leg_html = self.urlopen(leg_url)
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)
        leg = Legislator(term, chamber, district, name, party=party,
                         url=leg_url)
        leg.add_source(url)
        hotgarbage = ('Senate Biography Information for the 98th General '
                      'Assembly is not currently available.')
        if hotgarbage in leg_html:
            # The legislator's bio isn't available yet.
            self.logger.warning('No legislator bio available for ' + name)
            self.save_legislator(leg)
            continue
        # Percent-encode the photo path so odd characters survive in the URL.
        photo_url = leg_doc.xpath(
            '//img[contains(@src, "/members/")]/@src')[0]
        photo_url_parsed = urlparse(photo_url)
        encoded_path = quote(photo_url_parsed.path)
        photo_url = photo_url_parsed._replace(path=encoded_path).geturl()
        leg.update(photo_url=photo_url)
        leg.add_source(leg_url)
        # email
        email = leg_doc.xpath('//b[text()="Email: "]')
        if email:
            leg['email'] = email[0].tail

        # function for turning an IL contact info table to office details
        def _table_to_office(table, office_type, office_name):
            addr = ''
            phone = ''
            fax = None
            for row in table.xpath('tr'):
                row = row.text_content().strip()
                # skip rows that aren't part of address
                if 'Office:' in row or row == 'Cook County':
                    continue
                # fax number row ends with FAX
                elif 'FAX' in row:
                    fax = row.replace(' FAX', '')
                # phone number starts with ( [make it more specific?]
                elif row.startswith('('):
                    phone = row
                # everything else is an address
                else:
                    addr += (row + '\n')
            # Skip offices whose accumulated address is only a comma.
            if addr.strip() != ',':
                leg.add_office(office_type, office_name,
                               address=addr.strip(), phone=phone, fax=fax)

        # extract both offices from tables
        table = leg_doc.xpath(
            '//table[contains(string(), "Springfield Office")]')
        if table:
            _table_to_office(table[3], 'capitol', 'Springfield Office')
        table = leg_doc.xpath(
            '//table[contains(string(), "District Office")]')
        if table:
            _table_to_office(table[3], 'district', 'District Office')

        self.save_legislator(leg)
def scrape(self, chamber, term):
    """Scrape Montana legislators for ``chamber`` during ``term``.

    Resolves the term to a session year/number, downloads that
    chamber's member CSV, merges in details scraped from each member's
    detail page, and saves one Legislator (with a district office) per
    CSV row.
    """
    # Map the term name onto the session metadata needed for the URL.
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()

    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District, e.g. "HD 3" -> ("HD", "3").
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party letter -> full party name; drop the raw column so it
        # isn't copied onto the Legislator via entry.update below.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        del entry['party']

        # Get full name properly capped.
        fullname = '%s %s' % (entry['first_name'].capitalize(),
                              entry['last_name'].capitalize())

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]

        # Build the district office address from the CSV columns.
        address = '\n'.join([
            entry['address'],
            '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])])

        office = dict(
            name='District Office', type='district',
            phone=None, fax=None, email=None,
            address=address)

        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url

        # Phone/fax for the district office come from the detail page.
        office['phone'] = deets.get('phone')
        office['fax'] = deets.get('fax')
        legislator.add_office(**office)

        self.save_legislator(legislator)
def scrape(self, chamber, term):
    """Scrape Montana legislators for ``chamber`` during ``term``.

    Downloads the chamber's member CSV, fuzzy-matches each all-caps
    CSV name against names collected from the committee pages, merges
    in details from the member's detail page, and saves a Legislator
    per row.
    """
    # Map the term name onto the session metadata needed for the URL.
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Scrape committees. Also produce a name dictionary that can be
    # used for fuzzy matching between the committee page names and the
    # all-caps csv names.
    # NOTE(review): this loop only keeps the *last* yielded name_dict —
    # confirm scrape_committees is meant to be exhausted for that value.
    for name_dict, _ in scrape_committees(year, chamber):
        pass

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()

    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    # NOTE(review): unlike the sibling MT scraper, no header row is
    # skipped here — confirm whether this feed actually has headers.
    for entry in csv_parser:
        if not entry:
            continue

        # City.
        entry['city'] = entry['city'].title()

        # Address.
        entry['address'] = entry['address'].title()

        # District, e.g. "HD 3" -> ("HD", "3").
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party letter -> full name. The raw column is removed so it is
        # not copied onto the Legislator via entry.update below.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        entry['party'] = party
        del entry['party']

        # Get full name properly capped.
        _fullname = '%s %s' % (entry['first_name'].capitalize(),
                               entry['last_name'].capitalize())

        # Fuzzy-match against the committee-page names for this city.
        city_lower = entry['city'].lower()
        fullname = difflib.get_close_matches(
            _fullname, name_dict[city_lower], cutoff=0.5)

        # If there are no close matches with the committee page,
        # use the title-capped first and last name.
        if len(fullname) < 1:
            fullname = _fullname
        else:
            fullname = fullname[0]

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]
        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url

        self.save_legislator(legislator)
def parse_legislator(self, tr, term, chamber):
    """
    Given a tr element, get specific data from it.

    Extracts url, district, party, full name, photo and office
    addresses from one row of the CA member table and returns a
    populated Legislator.
    """
    strip = methodcaller("strip")

    # Cell-class template: (chamber title, field slug, trailing xpath).
    xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'

    # Each field may appear under more than one cell class, so every
    # field maps to a *list* of (slug, tail-xpath) candidates.
    xp = {
        "url": [("lname-value-1", "/a/@href"),
                ("member-lname-value-1", "/a/@href")],
        "district": [("district-value", "/text()")],
        "party": [("party-value", "/text()")],
        "full_name": [("feedbackurl-value", "/a/text()")],
        "address": [("feedbackurl-value", "/p/text()"),
                    ("feedbackurl-value", "/p/font/text()")],
    }

    titles = {"upper": "senator", "lower": "member"}

    # Per-field post-processing; fields without one pass through as-is.
    funcs = {"full_name": lambda s: s.replace("Contact Senator",
                                              "").strip(),
             "address": parse_address}

    rubberstamp = lambda _: _
    tr_xpath = tr.xpath

    # Collect every candidate match for every field.
    res = collections.defaultdict(list)
    for k, xpath_info in xp.items():
        for vals in xpath_info:
            f = funcs.get(k, rubberstamp)
            vals = (titles[chamber],) + vals
            vals = map(f, map(strip, tr_xpath(xpath % vals)))
            res[k].extend(vals)

    # Photo.
    try:
        res["photo_url"] = tr_xpath("td/p/img/@src")[0]
    except IndexError:
        pass

    # Addresses: parse_address yields key/value pair sequences.
    addresses = res["address"]
    try:
        addresses = map(dict, filter(None, addresses))
    except ValueError:
        # Sometimes legislators only have one address, in which
        # case this awful hack is helpful.
        addresses = map(dict, filter(None, [addresses]))

    # Iterate a copy so removal is safe while filtering.
    for address in addresses[:]:
        # Toss results that don't have required keys.
        if not set(["street", "city", "zip"]) < set(address):
            if address in addresses:
                addresses.remove(address)

    # Re-key the addresses: first one is the capitol office, the rest
    # are district offices.
    offices = []
    if addresses:
        # Mariko Yamada's addresses wouldn't parse correctly as of
        # 3/23/2013, so here we're forced to test whether any
        # addresses were even found.
        addresses[0].update(type="capitol", name="Capitol Office")
        offices.append(addresses[0])
        for office in addresses[1:]:
            office.update(type="district", name="District Office")
            offices.append(office)

    # Flatten street/city/zip into a single display address.
    for office in offices:
        street = office["street"]
        street = "%s\n%s, %s %s" % (street, office["city"], "CA",
                                    office["zip"])
        office["address"] = street
        office["fax"] = None
        office["email"] = None
        del office["street"], office["city"], office["zip"]

    res["offices"] = offices
    del res["address"]

    # Remove junk from assembly member names.
    # NOTE(review): unlike parse_assembly, this pop() is unguarded and
    # raises IndexError when no name was found — confirm intended.
    junk = "Contact Assembly Member "
    res["full_name"] = res["full_name"].pop().replace(junk, "")

    # Normalize party: first truthy candidate wins; None if none found.
    for party in res["party"][:]:
        if party:
            if party == "Democrat":
                party = "Democratic"
            res["party"] = party
            break
    else:
        res["party"] = None

    # Mariko Yamada also didn't have a url that lxml would parse
    # as of 3/22/2013.
    if res["url"]:
        res["url"] = res["url"].pop()
    else:
        del res["url"]

    # strip leading zero
    res["district"] = str(int(res["district"].pop()))

    # Add a source for the url.
    leg = Legislator(term, chamber, **res)
    leg.update(**res)

    return leg
def scrape(self, term, chambers):
    """Scrape the Philadelphia mayor and city council for ``term``.

    Saves the mayor as a Person (with a capitol office), then visits
    each councilmember page, classifying the contact-page text lines
    into up to two offices (City Hall plus a local/neighborhood
    office).
    """
    # The mayor doesn't sit on council.
    url = 'http://www.phila.gov/'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # The mayor's name doesn't appear on the mayor's page!
    full_name = re.search(
        'Mayor (.+)',
        doc.xpath('//title/text()')[0].strip()).group(1)
    first_name, middle_name, last_name = parse_full_name(full_name)
    mayor = Person(full_name, first_name, last_name, middle_name)
    mayor.add_source(url)

    # Contact details come from the dedicated mayor page.
    url = 'http://www.phila.gov/mayor/'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    lines = map(clean_string, doc.xpath(
        '//div[contains(text(),"Mailing Address")]'
        '/following-sibling::text()')[1:])
    address = '\n'.join(lines)
    phone = '-'.join(tel_regex.search(doc.xpath(
        '//strong[contains(text(),"Phone")]'
        '/following-sibling::text()[1]')[0]).groups())
    fax = '-'.join(tel_regex.search(doc.xpath(
        '//strong[contains(text(),"Fax")]'
        '/following-sibling::text()[1]')[0]).groups())
    email = clean_string(doc.xpath(
        '//strong[contains(text(),"Email")]'
        '/following-sibling::text()[1]')[0])
    mayor.update(dict(url=url, email=email))
    mayor.add_office('capitol', 'Office of the Mayor',
                     address=address, phone=phone, fax=fax)
    mayor.add_role('Mayor', term)
    mayor.add_source(url)
    self.save_object(mayor)

    council_url = 'http://philadelphiacitycouncil.net/council-members/'
    doc = lxml.html.fromstring(self.urlopen(council_url))
    doc.make_links_absolute(council_url)
    urls = set(doc.xpath(
        '//a[contains(@href, "/council-members/council")]/@href'))
    assert len(urls) <= 17, \
        'expected 17 unique councilmember URLs, found %d' % len(urls)

    for url in urls:
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        optional = dict()  # fields not all legislators will have
        full_name = []
        first_name = ''
        middle_name = ''
        last_name = ''
        suffixes = ''
        roles = []
        # Address lines for the primary (City Hall) and second office.
        lines = []
        lines_office2 = []
        has_office2 = bool(False)
        reached_contact_form = bool(False)
        phone1 = None
        phone1_office2 = None
        phone2 = None
        phone2_office2 = None
        fax = None
        fax_office2 = None
        office_name = None
        district = 'At-Large'  # default

        # Try image classes from most to least specific.
        photo_url = (
            # Special case for BRIAN J. O'NEILL
            doc.xpath('//img[contains(@title, "brian picture")]/@src') or
            doc.xpath('//img[contains(@class, "size-full")]/@src') or
            doc.xpath('//img[contains(@class, "size-medium")]/@src') or
            doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
        )[0]

        # That's an en dash, not a hyphen.
        parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
        # First segment carries the title + name; later segments carry
        # suffix, district or extra roles.
        for index, part in enumerate(filter(None, parts)):
            part = clean_string(part)
            if index == 0:
                if 'Councilman' in part:
                    optional['gender'] = 'Male'
                elif 'Councilwoman' in part:
                    optional['gender'] = 'Female'
                elif 'Council President' in part:
                    roles.append('Council President')
                part = re.sub('^Council(?:man|woman| President)\s+',
                              '', part)
                full_name.append(part)
                first_name, middle_name, last_name = \
                    parse_full_name(full_name[0])
            elif part in ('Jr.', 'Sr.'):
                full_name.append(part)
                suffixes = part
            elif 'District' in part:
                district = part
            else:
                roles.append(part)
        full_name = ', '.join(full_name)

        contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
        doc = lxml.html.fromstring(self.urlopen(contact_url))
        doc.make_links_absolute(contact_url)

        # @todo email, personal_url are sometimes in another paragraph.
        parts = doc.xpath('//div[@class="post-entry"]//text()')
        parts = map(clean_string, parts)
        # State machine: classify each text line; address lines are
        # accumulated while consuming_address_lines is True.
        consuming_address_lines = bool(False)
        for part in filter(None, parts):
            # Special case for Curtis Jones Jr.
            if re.match(r'^Local Office:', part):
                consuming_address_lines = True
                has_office2 = True
                office_name = 'Local Office'
            # Ignore headers, social links, and everything after the
            # contact form starts.
            if re.match(r'City Hall Office', part) or \
                    re.match(r'^Hours', part) or \
                    re.match(r'.*facebook', part) or \
                    re.match(r'.*twitter', part) or \
                    reached_contact_form:
                continue
            elif re.match(r'^Contact Council.*man', part) or \
                    re.match(r'^Contact CMAL', part):
                reached_contact_form = True
                continue
            elif re.match(r'^City Hall.+Room', part):
                consuming_address_lines = True
                lines.append(part)
            elif re.match(r'^FAX:', part, re.I) or \
                    re.match(r'^F:', part, re.I):
                consuming_address_lines = False
                # First fax seen goes to the second office if one has
                # been started, otherwise to the primary office.
                if has_office2 and fax_office2 == None:
                    fax_office2 = '-'.join(
                        tel_regex.search(part).groups())
                elif fax == None:
                    fax = '-'.join(tel_regex.search(part).groups())
            elif tel_regex.search(part):
                consuming_address_lines = False
                if has_office2 and phone1_office2 == None \
                        and phone2_office2 == None:
                    phone1_office2, phone2_office2 = parse_phones(part)
                elif phone1 == None and phone2 == None:
                    phone1, phone2 = parse_phones(part)
            elif '@' in part:
                consuming_address_lines = False
                optional['email'] = re.search('\S+@\S+', part).group()
            elif re.match(r'^Neighborhood Office.*', part):
                consuming_address_lines = False
                has_office2 = True
            elif re.match(r'.*Office.*', part) or \
                    re.match(r'.*Heroes Hall.*', part):
                # Special case for Curtis Jones Jr.
                if re.match(r'.*Local Office.*', part):
                    continue
                if len(lines_office2) > 0:
                    consuming_address_lines = False
                else:
                    consuming_address_lines = True
                    office_name = string.strip(part, ':;,.')
            elif consuming_address_lines:
                if has_office2:
                    lines_office2.append(cleanup_address(part, False))
                else:
                    lines.append(cleanup_address(part))
            elif re.match(
                    r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$',
                    part):
                pass
            else:
                self.logger.warning('Skipped: ' + part)

        # Some Councilmembers have no zip code or only a 5-digit zip code.
        # All that changes between them is a room number.
        address = '\n'.join(lines)
        address_office2 = '\n'.join(lines_office2)

        legislator = Legislator(term, 'upper', district, full_name,
                                first_name, last_name, middle_name,
                                suffixes=suffixes, url=url,
                                photo_url=photo_url, party=None)
        legislator.update(optional)
        # Only attach an office if some non-whitespace address text
        # was actually collected.
        if re.search('.*\S.*', address):
            legislator.add_office('capitol', 'City Hall Office',
                                  address=address, phone=phone1,
                                  secondary_phone=phone2, fax=fax)
        if re.search('.*\S.*', address_office2):
            legislator.add_office('district', office_name,
                                  address=address_office2,
                                  phone=phone1_office2,
                                  secondary_phone=phone2_office2,
                                  fax=fax_office2)
        legislator.add_source(url)
        for role in roles:
            legislator.add_role(role, term)
        self.save_legislator(legislator)
def parse_legislator(self, tr, term, chamber,
        strip=methodcaller('strip'),
        xpath='td[contains(@class, "views-field-field-%s-%s")]%s',
        # NOTE: the default dicts below are treated as read-only; they
        # are kept in the signature to preserve the existing interface.
        xp={'url': ('lname-value-1', '/a/@href'),
            'district': ('district-value', '/text()'),
            'party': ('party-value', '/text()'),
            'full_name': ('feedbackurl-value', '/a/text()'),
            'address': ('feedbackurl-value', '/p/text()')},
        titles={'upper': 'senator', 'lower': 'member'},
        funcs={
            'full_name': lambda s: s.replace('Contact Senator',
                                             '').strip(),
            'address': parse_address,
        }):
    '''
    Given a tr element, get specific data from it.

    Extracts url, district, party, full name, photo and office
    addresses from one member-table row and returns a populated
    Legislator.
    '''
    rubberstamp = lambda _: _
    tr_xpath = tr.xpath

    res = {}
    for k, v in xp.items():
        f = funcs.get(k, rubberstamp)
        v = (titles[chamber],) + v
        v = map(f, map(strip, tr_xpath(xpath % v)))
        # Single matches collapse to the bare value.
        if len(v) == 1:
            res[k] = v[0]
        else:
            res[k] = v

    # Photo.
    try:
        res['photo_url'] = tr_xpath('td/p/img/@src')[0]
    except IndexError:
        pass

    # Addresses: parse_address yields key/value pair sequences.
    addresses = res['address']
    try:
        addresses = map(dict, filter(None, addresses))
    except ValueError:
        # Sometimes legislators only have one address, in which
        # case this awful hack is helpful.
        addresses = map(dict, filter(None, [addresses]))

    # BUG FIX: iterate over a copy — calling remove() on the list being
    # iterated skips the element immediately after each removal.
    for x in addresses[:]:
        try:
            x['zip'] = x['zip'].replace('CA ', '')
        except KeyError:
            # No zip? Toss.
            addresses.remove(x)

    # Re-key the addresses: first is the capitol office, the rest are
    # district offices.
    # NOTE(review): addresses[0] is unguarded and raises IndexError if
    # every address was tossed — confirm against the caller.
    addresses[0].update(type='capitol', name='Capitol Office')
    offices = [addresses[0]]
    for office in addresses[1:]:
        office.update(type='district', name='District Office')
        offices.append(office)

    # Flatten street/city/zip into a single display address.
    for office in offices:
        street = office['street']
        street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                    office['zip'])
        office['address'] = street
        office['fax'] = None
        office['email'] = None
        del office['street'], office['city'], office['zip']

    res['offices'] = offices
    del res['address']

    # Remove junk from assembly member names.
    junk = 'Contact Assembly Member '
    res['full_name'] = res['full_name'].replace(junk, '')

    # convert party
    if res['party'] == 'Democrat':
        res['party'] = 'Democratic'

    # strip leading zero
    res['district'] = str(int(res['district']))

    # Add a source for the url.
    leg = Legislator(term, chamber, **res)
    leg.update(**res)

    return leg
def scrape(self, chamber, term):
    """Scrape Montana legislators for ``chamber`` during ``term``.

    Resolves the term to a session year/number, downloads the
    chamber's member CSV, merges in details scraped from each member's
    detail page, and saves one Legislator (with a district office) per
    CSV row.
    """
    # Map the term name onto the session metadata needed for the URL.
    for tdata in self.metadata['terms']:
        if term == tdata['name']:
            year = tdata['start_year']
            session_number = tdata['session_number']
            break

    # Fetch the csv.
    url = 'http://leg.mt.gov/content/sessions/%s/%d%sMembers.txt' % \
        (session_number, year, chamber == 'upper' and 'Senate' or 'House')

    # Parse it.
    data = self.urlopen(url)
    data = data.replace('"""', '"')  # weird triple quotes
    data = data.splitlines()

    fieldnames = ['last_name', 'first_name', 'party', 'district',
                  'address', 'city', 'state', 'zip']
    csv_parser = csv.DictReader(data, fieldnames)

    district_leg_urls = self._district_legislator_dict()

    # Toss the row headers.
    next(csv_parser)

    for entry in csv_parser:
        if not entry:
            continue

        # District, e.g. "HD 3" -> ("HD", "3").
        district = entry['district']
        hd_or_sd, district = district.split()
        del entry['district']

        # Party letter -> full party name; the raw column is dropped so
        # it isn't copied onto the Legislator via entry.update below.
        party_letter = entry['party']
        party = {'D': 'Democratic', 'R': 'Republican'}[party_letter]
        del entry['party']

        # Get full name properly capped.
        fullname = '%s %s' % (entry['first_name'].capitalize(),
                              entry['last_name'].capitalize())

        # Get any info at the legislator's detail_url.
        detail_url = district_leg_urls[hd_or_sd][district]

        # Build the district office address from the CSV columns.
        address = '\n'.join([
            entry['address'],
            '%s, %s %s' % (entry['city'], entry['state'], entry['zip'])])

        office = dict(
            name='District Office', type='district',
            phone=None, fax=None, email=None,
            address=address)

        deets = self._scrape_details(detail_url)

        # Add the details and delete junk.
        entry.update(deets)
        del entry['first_name'], entry['last_name']

        legislator = Legislator(term, chamber, district, fullname,
                                party=party)
        legislator.update(entry)
        legislator.add_source(detail_url)
        legislator.add_source(url)
        legislator['url'] = detail_url

        # Phone/fax for the district office come from the detail page.
        office['phone'] = deets.get('phone')
        office['fax'] = deets.get('fax')
        legislator.add_office(**office)

        self.save_legislator(legislator)
def parse_legislator(
        self, tr, term, chamber,
        strip=methodcaller('strip'),
        xpath='td[contains(@class, "views-field-field-%s-%s")]%s',
        # NOTE: the default dicts below are treated as read-only; they
        # are kept in the signature to preserve the existing interface.
        xp={
            'url': ('lname-value-1', '/a/@href'),
            'district': ('district-value', '/text()'),
            'party': ('party-value', '/text()'),
            'full_name': ('feedbackurl-value', '/a/text()'),
            'address': ('feedbackurl-value', '/p/text()')
        },
        titles={
            'upper': 'senator',
            'lower': 'member'
        },
        funcs={
            'full_name': lambda s: s.replace('Contact Senator',
                                             '').strip(),
            'address': parse_address,
        }):
    '''
    Given a tr element, get specific data from it.

    Extracts url, district, party, full name, photo and office
    addresses from one member-table row and returns a populated
    Legislator.
    '''
    rubberstamp = lambda _: _
    tr_xpath = tr.xpath

    res = {}
    for k, v in xp.items():
        f = funcs.get(k, rubberstamp)
        v = (titles[chamber], ) + v
        v = map(f, map(strip, tr_xpath(xpath % v)))
        # Single matches collapse to the bare value.
        if len(v) == 1:
            res[k] = v[0]
        else:
            res[k] = v

    # Photo.
    try:
        res['photo_url'] = tr_xpath('td/p/img/@src')[0]
    except IndexError:
        pass

    # Addresses: parse_address yields key/value pair sequences.
    addresses = res['address']
    try:
        addresses = map(dict, filter(None, addresses))
    except ValueError:
        # Sometimes legislators only have one address, in which
        # case this awful hack is helpful.
        addresses = map(dict, filter(None, [addresses]))

    # BUG FIX: iterate over a copy — calling remove() on the list being
    # iterated skips the element immediately after each removal.
    for x in addresses[:]:
        try:
            x['zip'] = x['zip'].replace('CA ', '')
        except KeyError:
            # No zip? Toss.
            addresses.remove(x)

    # Re-key the addresses: first is the capitol office, the rest are
    # district offices.
    # NOTE(review): addresses[0] is unguarded and raises IndexError if
    # every address was tossed — confirm against the caller.
    addresses[0].update(type='capitol', name='Capitol Office')
    offices = [addresses[0]]
    for office in addresses[1:]:
        office.update(type='district', name='District Office')
        offices.append(office)

    # Flatten street/city/zip into a single display address.
    for office in offices:
        street = office['street']
        street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                    office['zip'])
        office['address'] = street
        office['fax'] = None
        office['email'] = None
        del office['street'], office['city'], office['zip']

    res['offices'] = offices
    del res['address']

    # Remove junk from assembly member names.
    junk = 'Contact Assembly Member '
    res['full_name'] = res['full_name'].replace(junk, '')

    # convert party
    if res['party'] == 'Democrat':
        res['party'] = 'Democratic'

    # strip leading zero
    res['district'] = str(int(res['district']))

    # Add a source for the url.
    leg = Legislator(term, chamber, **res)
    leg.update(**res)

    return leg
def parse_assembly(self, tr, term, chamber):
    '''
    Given a tr element, get specific data from it.

    Extracts url, district, party, full name, photo and office
    addresses from one row of the CA assembly table; returns a
    populated Legislator, or None when no name could be found.
    '''
    strip = methodcaller('strip')

    # Cell-class template: (chamber title, field slug, trailing xpath).
    xpath = 'td[contains(@class, "views-field-field-%s-%s")]%s'

    # Each field may appear under more than one cell class, so every
    # field maps to a *list* of (slug, tail-xpath) candidates.
    xp = {
        'url': [('lname-value-1', '/a/@href'),
                ('member-lname-value-1', '/a/@href')],
        'district': [('district-value', '/text()')],
        'party': [('party-value', '/text()')],
        'full_name': [('feedbackurl-value', '/a/text()')],
        'address': [('feedbackurl-value', '/p/text()'),
                    ('feedbackurl-value', '/p/font/text()')]
    }

    titles = {'upper': 'senator', 'lower': 'member'}

    # Per-field post-processing; fields without one pass through as-is.
    funcs = {
        'full_name': lambda s: s.replace('Contact Senator', '').strip(),
        'address': parse_address,
    }

    rubberstamp = lambda _: _
    tr_xpath = tr.xpath

    # Collect every candidate match for every field.
    res = collections.defaultdict(list)
    for k, xpath_info in xp.items():
        for vals in xpath_info:
            f = funcs.get(k, rubberstamp)
            vals = (titles[chamber],) + vals
            vals = map(f, map(strip, tr_xpath(xpath % vals)))
            res[k].extend(vals)

    # Photo.
    try:
        res['photo_url'] = tr_xpath('td/p/img/@src')[0]
    except IndexError:
        pass

    # Addresses: parse_address yields key/value pair sequences.
    addresses = res['address']
    try:
        addresses = map(dict, filter(None, addresses))
    except ValueError:
        # Sometimes legislators only have one address, in which
        # case this awful hack is helpful.
        addresses = map(dict, filter(None, [addresses]))

    # Iterate a copy so removal is safe while filtering.
    for address in addresses[:]:
        # Toss results that don't have required keys.
        if not set(['street', 'city', 'zip']) < set(address):
            if address in addresses:
                addresses.remove(address)

    # Re-key the addresses: first one is the capitol office, the rest
    # are district offices.
    offices = []
    if addresses:
        # Mariko Yamada's addresses wouldn't parse correctly as of
        # 3/23/2013, so here we're forced to test whether any
        # addresses were even found.
        addresses[0].update(type='capitol', name='Capitol Office')
        offices.append(addresses[0])
        for office in addresses[1:]:
            office.update(type='district', name='District Office')
            offices.append(office)

    # Flatten street/city/zip into a single display address.
    for office in offices:
        street = office['street']
        street = '%s\n%s, %s %s' % (street, office['city'], 'CA',
                                    office['zip'])
        office['address'] = street
        office['fax'] = None
        office['email'] = None
        del office['street'], office['city'], office['zip']

    res['offices'] = offices
    del res['address']

    # Remove junk from assembly member names; bail out entirely when
    # no name was scraped at all.
    junk = 'Contact Assembly Member '
    try:
        res['full_name'] = res['full_name'].pop().replace(junk, '')
    except IndexError:
        return

    # Normalize party: first truthy candidate wins; None if none found.
    for party in res['party'][:]:
        if party:
            if party == 'Democrat':
                party = 'Democratic'
            res['party'] = party
            break
    else:
        res['party'] = None

    # Mariko Yamada also didn't have a url that lxml would parse
    # as of 3/22/2013.
    if res['url']:
        res['url'] = res['url'].pop()
    else:
        del res['url']

    # strip leading zero
    res['district'] = str(int(res['district'].pop()))

    # Add a source for the url.
    leg = Legislator(term, chamber, **res)
    leg.update(**res)

    return leg
def scrape(self, term, chambers):
    """Scrape the Philadelphia mayor and city council for ``term``.

    Saves the mayor as a Person (with a capitol office), then visits
    each councilmember page and classifies the first contact paragraph
    into a single City Hall office.
    """
    # The mayor doesn't sit on council.
    url = 'http://www.phila.gov/'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    # The mayor's name doesn't appear on the mayor's page!
    name = re.search(
        'Mayor (.+)',
        doc.xpath('//title/text()')[0].strip()).group(1)
    mayor = Person(name)
    mayor.add_source(url)

    # Contact details come from the dedicated mayor page.
    url = 'http://www.phila.gov/mayor/'
    doc = lxml.html.fromstring(self.urlopen(url))
    doc.make_links_absolute(url)
    lines = map(clean_string, doc.xpath(
        '//div[contains(text(),"Mailing Address")]'
        '/following-sibling::text()')[1:])
    address = '\n'.join(lines)
    phone = '-'.join(tel_regex.search(doc.xpath(
        '//strong[contains(text(),"Phone")]'
        '/following-sibling::text()[1]')[0]).groups())
    fax = '-'.join(tel_regex.search(doc.xpath(
        '//strong[contains(text(),"Fax")]'
        '/following-sibling::text()[1]')[0]).groups())
    email = clean_string(doc.xpath(
        '//strong[contains(text(),"Email")]'
        '/following-sibling::text()[1]')[0])
    mayor.update(dict(url=url, email=email))
    mayor.add_office('capitol', 'Office of the Mayor',
                     address=address, phone=phone, fax=fax)
    mayor.add_role('Mayor', term)
    mayor.add_source(url)
    self.save_object(mayor)

    council_url = 'http://philadelphiacitycouncil.net/council-members/'
    doc = lxml.html.fromstring(self.urlopen(council_url))
    doc.make_links_absolute(council_url)
    urls = set(doc.xpath(
        '//a[contains(@href, "/council-members/council")]/@href'))
    assert len(urls) <= 17, \
        'expected 17 unique councilmember URLs, found %d' % len(urls)

    for url in urls:
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        optional = dict()  # fields not all legislators will have
        name = []
        roles = []
        lines = []  # accumulated office address lines
        phone1 = None
        phone2 = None
        fax = None
        district = 'At-Large'  # default

        # Try image classes from most to least specific.
        photo_url = (
            doc.xpath('//img[contains(@class, "size-full")]/@src') or
            doc.xpath('//img[contains(@class, "size-medium")]/@src') or
            doc.xpath('//img[contains(@class, "size-thumbnail")]/@src')
        )[0]

        # That's an en dash, not a hyphen.
        parts = re.split(u'[,–]', doc.xpath('//h3/text()')[0])
        # First segment carries the title + name; later segments carry
        # suffix, district or extra roles.
        for index, part in enumerate(filter(None, parts)):
            part = clean_string(part)
            if index == 0:
                if 'Councilman' in part:
                    optional['gender'] = 'Male'
                elif 'Councilwoman' in part:
                    optional['gender'] = 'Female'
                elif 'Council President' in part:
                    roles.append('Council President')
                part = re.sub('^Council(?:man|woman| President)\s+',
                              '', part)
                name.append(part)
            elif part in ('Jr.', 'Sr.'):
                name.append(part)
            elif 'District' in part:
                district = part
            else:
                roles.append(part)
        name = ', '.join(name)

        contact_url = doc.xpath('//a[text()="Contact"]/@href')[0]
        doc = lxml.html.fromstring(self.urlopen(contact_url))
        doc.make_links_absolute(contact_url)

        # @todo email, second office, personal_url are sometimes in another paragraph.
        if len(doc.xpath('//div[@class="post-entry"]/p')) > 1:
            self.logger.warning(
                'Skipped paragraphs:\n' + '\n'.join(
                    lxml.html.tostring(html) for html in doc.xpath(
                        '//div[@class="post-entry"]/p[position()>1]')))

        # Prefer the first paragraph; fall back to all the text.
        parts = doc.xpath(
            '//div[@class="post-entry"]/p[position()=1]//text()') or \
            doc.xpath('//div[@class="post-entry"]//text()')
        parts = map(clean_string, parts)
        for part in filter(None, parts):
            if re.match(r'^City Hall', part):
                lines.append('City Hall, Room %s' % re.search(
                    'Room (\d+)', part).group(1))
            elif re.match(r'^FAX:', part, re.I):
                fax = '-'.join(tel_regex.search(part).groups())
            elif tel_regex.search(part):
                if phone1:
                    self.logger.warning(
                        'Already have phone numbers for one office: '
                        + part)
                else:
                    phones = tel_regex.findall(part)
                    phone1 = '-'.join(phones[0])
                    if len(phones) == 2:
                        phone2 = '-'.join(phones[1])
                    else:
                        # Second number given as a 4-digit extension,
                        # e.g. "555-1234 or 5678" / "555-1234/5678".
                        # NOTE(review): hard-codes parts[2] rather than
                        # the current part — confirm intended.
                        phone2 = phone1[:8] + re.search(
                            r'(?: or |/)(\d{4})$', parts[2]).group(1)
            elif '@' in part:
                optional['email'] = re.search('\S+@\S+', part).group()
            elif re.match(
                    r'^(?:, )?Philadelphia, PA(?: 19107(?:-3290)?)?$',
                    part):
                pass
            else:
                # @todo second office is sometimes in the same paragraph.
                self.logger.warning('Skipped: ' + part)

        # Some Councilmembers have no zip code or only a 5-digit zip code.
        # All that changes between them is a room number.
        lines.append('Philadelphia, PA 19107-3290')
        address = '\n'.join(lines)

        legislator = Legislator(term, 'upper', district, name, url=url,
                                photo_url=photo_url, party=None)
        legislator.update(optional)
        legislator.add_office('capitol', 'Council Office',
                              address=address, phone=phone1,
                              secondary_phone=phone2, fax=fax)
        legislator.add_source(url)
        for role in roles:
            legislator.add_role(role, term)
        self.save_legislator(legislator)