def scrape_lower_chamber(self, term): # E-mail contact is now hidden behind webforms. Sadness. party_map = {'PNP': 'Partido Nuevo Progresista', 'PPD': u'Partido Popular Democr\xe1tico', 'PIP': u'Partido Independentista Puertorrique\u00F1o', } url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara.aspx' page = self.lxmlize(url) member_nodes = self.get_nodes( page, '//div[@class="info-block"][1]//a[@class="opener"]') if member_nodes is not None: for member_node in member_nodes: # Initialize default values for legislator attributes. name = None district = None address = None party = None photo_url = None phone = None fax = None photo_url = self.get_node( member_node, './/span[@class="identity"]/img/@src') # Node reference for convenience. info_node = self.get_node( member_node, './/span[@class="info"]') name_node = self.get_node( info_node, './/span[@class="name"]') # Strip titles from legislator name. if name_node is not None: name_text = name_node.text.strip() name_text = re.sub(r'^Hon\.[\s]*', '', name_text) name_text = re.sub(r' - .*$', '', name_text) name = ' '.join(name_text.split()) party_node = self.get_node( info_node, './/span[@class="party"]/span') if party_node is not None: party_text = party_node.text.strip() party = party_map[party_text] district_node = self.get_node( info_node, './/span[@class="district"]') if district_node is not None: district_text = district_node.text.strip() try: # district_number = re.search(r'0?(\d{1,2})', # district_text).group(1) district = re.sub(r'^Distrito[\s]*', '', district_text).strip() except AttributeError: if "Distrito" not in district_text: district = 'At-Large' else: warning = u'{} missing district number.' self.warning(warning.format(name)) address_node = self.get_node( info_node, './/span[@class="address"]') if address_node is not None: address_text = address_node.text if address_text and not address_text.isspace(): address = address_text.strip() # Only grabs the first validated phone number found. # Typically, representatives have multiple phone numbers. phone_nodes = self.get_nodes( member_node, './/span[@class="two-columns"]//span[@class="data-type"' 'and contains(text(), "Tel:")]') if phone_nodes is not None: has_valid_phone = False for phone_node in phone_nodes: # Don't keep searching phone numbers if a good # one is found. if has_valid_phone: break phone_text = phone_node.text phone_text = re.sub(r'^Tel:[\s]*', '', phone_text)\ .strip() if self.validate_phone_number(phone_text): phone = phone_text has_valid_phone = True fax_node = self.get_node( member_node, './/span[@class="two-columns"]//span[@class="data-type"' ' and contains(text(), "Fax:")]') if fax_node is not None: fax_text = fax_node.text fax_text = re.sub(r'^Fax:[\s]*', '', fax_text).strip() if self.validate_phone_number(fax_text): fax = fax_text person = Person(primary_org='lower', district=district, name=name, party=party, image=photo_url) person.add_link(url) person.add_source(url) if address: person.add_contact_deatil(type='address', value=address, note='capitol Office') if phone: person.add_contact_deatil(type='voice', value=phone, note='capitol Office') if fax: person.add_contact_deatil(type='fax', value=fax, note='capitol Office') yield person
def scrape_upper_chamber(self, term): urls = { 'At-Large': 'http://www.senadopr.us/Pages/SenadoresporAcumulacion.aspx', 'I': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx', 'II': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx', 'III': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx', 'IV': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx', 'V': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx', 'VI': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx', 'VII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx', 'VIII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx' } for district, url in urls.items(): leg_page_html = self.get(url).text doc = lxml.html.fromstring(leg_page_html) doc.make_links_absolute(url) rows = doc.xpath('//table[@summary="Senadores 2013-2016"]' '/tr[not(@class="ms-viewheadertr")]') for row in rows: tds = row.xpath('td') name = tds[0].text_content().title().replace('Hon.', '', 1).strip() party = tds[1].text_content() phone = tds[2].text_content() email = tds[3].text_content() # Code to guess the picture # Those middle names abbreviations are sometimes weird. namefixed = str(name.replace(".", ". ")) # Remove the accents namefixed = unicodedata.normalize('NFKD', namefixed).encode( 'ascii', 'ignore') nameparts = namefixed.split() if nameparts[1].endswith('.'): lastname = nameparts[2] else: lastname = nameparts[1] # Construct the photo url photo_url = 'http://www.senadopr.us/Fotos%20Senadores/sen_' + \ (nameparts[0][0] + lastname).lower() + '.jpg' try: self.head( photo_url) # Checking to see if the file is there except scrapelib.HTTPError: # If not, leave out the photo_url photo_url = '' person = Person(primary_org='upper', district=district, name=name, party=party, image=photo_url) if email: person.add_contact_deatil(type='email', value=email, note='Capitol Office') if phone: person.add_contact_deatil(type='voice', value=phone, note='Capitol Office') person.add_link(url) person.add_source(url) yield person
def scrape_upper_chamber(self, term): urls = { 'At-Large': 'http://www.senadopr.us/Pages/SenadoresporAcumulacion.aspx', 'I': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20I.aspx', 'II': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20II.aspx', 'III': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20III.aspx', 'IV': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20IV.aspx', 'V': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20V.aspx', 'VI': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VI.aspx', 'VII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VII.aspx', 'VIII': 'http://www.senadopr.us/Pages/Senadores%20Distrito%20VIII.aspx' } for district, url in urls.items(): leg_page_html = self.get(url).text doc = lxml.html.fromstring(leg_page_html) doc.make_links_absolute(url) rows = doc.xpath('//table[@summary="Senadores 2013-2016"]' '/tr[not(@class="ms-viewheadertr")]') for row in rows: tds = row.xpath('td') name = tds[0].text_content().title().replace('Hon.', '', 1).strip() party = tds[1].text_content() phone = tds[2].text_content() email = tds[3].text_content() # Code to guess the picture # Those middle names abbreviations are sometimes weird. namefixed = str(name.replace(".", ". ")) # Remove the accents namefixed = unicodedata.normalize('NFKD', namefixed).encode('ascii', 'ignore') nameparts = namefixed.split() if nameparts[1].endswith('.'): lastname = nameparts[2] else: lastname = nameparts[1] # Construct the photo url photo_url = 'http://www.senadopr.us/Fotos%20Senadores/sen_' + \ (nameparts[0][0] + lastname).lower() + '.jpg' try: self.head(photo_url) # Checking to see if the file is there except scrapelib.HTTPError: # If not, leave out the photo_url photo_url = '' person = Person(primary_org='upper', district=district, name=name, party=party, image=photo_url) if email: person.add_contact_deatil(type='email', value=email, note='Capitol Office') if phone: person.add_contact_deatil(type='voice', value=phone, note='Capitol Office') person.add_link(url) person.add_source(url) yield person
def scrape_lower_chamber(self, term): # E-mail contact is now hidden behind webforms. Sadness. party_map = { 'PNP': 'Partido Nuevo Progresista', 'PPD': u'Partido Popular Democr\xe1tico', 'PIP': u'Partido Independentista Puertorrique\u00F1o', } url = 'http://www.tucamarapr.org/dnncamara/ComposiciondelaCamara.aspx' page = self.lxmlize(url) member_nodes = self.get_nodes( page, '//div[@class="info-block"][1]//a[@class="opener"]') if member_nodes is not None: for member_node in member_nodes: # Initialize default values for legislator attributes. name = None district = None address = None party = None photo_url = None phone = None fax = None photo_url = self.get_node( member_node, './/span[@class="identity"]/img/@src') # Node reference for convenience. info_node = self.get_node(member_node, './/span[@class="info"]') name_node = self.get_node(info_node, './/span[@class="name"]') # Strip titles from legislator name. if name_node is not None: name_text = name_node.text.strip() name_text = re.sub(r'^Hon\.[\s]*', '', name_text) name_text = re.sub(r' - .*$', '', name_text) name = ' '.join(name_text.split()) party_node = self.get_node(info_node, './/span[@class="party"]/span') if party_node is not None: party_text = party_node.text.strip() party = party_map[party_text] district_node = self.get_node(info_node, './/span[@class="district"]') if district_node is not None: district_text = district_node.text.strip() try: # district_number = re.search(r'0?(\d{1,2})', # district_text).group(1) district = re.sub(r'^Distrito[\s]*', '', district_text).strip() except AttributeError: if "Distrito" not in district_text: district = 'At-Large' else: warning = u'{} missing district number.' self.warning(warning.format(name)) address_node = self.get_node(info_node, './/span[@class="address"]') if address_node is not None: address_text = address_node.text if address_text and not address_text.isspace(): address = address_text.strip() # Only grabs the first validated phone number found. # Typically, representatives have multiple phone numbers. phone_nodes = self.get_nodes( member_node, './/span[@class="two-columns"]//span[@class="data-type"' 'and contains(text(), "Tel:")]') if phone_nodes is not None: has_valid_phone = False for phone_node in phone_nodes: # Don't keep searching phone numbers if a good # one is found. if has_valid_phone: break phone_text = phone_node.text phone_text = re.sub(r'^Tel:[\s]*', '', phone_text)\ .strip() if self.validate_phone_number(phone_text): phone = phone_text has_valid_phone = True fax_node = self.get_node( member_node, './/span[@class="two-columns"]//span[@class="data-type"' ' and contains(text(), "Fax:")]') if fax_node is not None: fax_text = fax_node.text fax_text = re.sub(r'^Fax:[\s]*', '', fax_text).strip() if self.validate_phone_number(fax_text): fax = fax_text person = Person(primary_org='lower', district=district, name=name, party=party, image=photo_url) person.add_link(url) person.add_source(url) if address: person.add_contact_deatil(type='address', value=address, note='capitol Office') if phone: person.add_contact_deatil(type='voice', value=phone, note='capitol Office') if fax: person.add_contact_deatil(type='fax', value=fax, note='capitol Office') yield person