def get_people(self):
    """Yield a Legislator for each "<name>, <post>" heading on COUNCIL_PAGE.

    Every ``h1`` with class ``title`` whose text contains a comma is treated
    as a council member.  A post containing "Mayor" yields the 'Maire' of
    'Beaconsfield', whose contact details are read from sibling ``div``
    elements and whose phone number is required.  Anyone else is a
    'Conseiller' whose details are read from sibling ``p`` elements and whose
    phone number is optional.  ``get_email`` (defined elsewhere) turns the
    sibling ``script`` text into an e-mail address.
    """
    root = lxmlize(COUNCIL_PAGE)
    for heading in root.xpath('//h1[@class="title"]'):
        label = heading.text_content()
        if ',' not in label:
            continue

        name, post = label.split(',')
        name = name.strip()

        is_mayor = 'Mayor' in post
        if is_mayor:
            post_id, role, sibling = 'Beaconsfield', 'Maire', 'div'
        else:
            # The district is the second '-'-separated field of the post.
            post_id, role, sibling = post.split('-')[1].strip(), 'Conseiller', 'p'

        p = Legislator(name=name, post_id=post_id, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]

        phones = heading.xpath(
            './/parent::div/following-sibling::%s[contains(text(), "514")]/text()' % sibling)
        # The mayor's phone is indexed unconditionally (a missing number is an
        # error); a councillor without a phone is simply left without one.
        if is_mayor or phones:
            voice = phones[0].split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', voice, 'legislature')

        script = heading.xpath(
            './/parent::div/following-sibling::%s/script' % sibling)[0].text_content()
        p.add_contact('email', get_email(script), None)
        yield p
def get_people(self):
    """Yield a Legislator for each two-cell first table row on COUNCIL_PAGE.

    The non-blank text nodes of a row end with name, phone and e-mail (in
    that order).  A row with exactly three text nodes is the mayor ('Maire'
    of Saint-Jérôme); otherwise the first text node is the district label,
    from which the word "numéro " is removed.  The last matching row is
    discarded, as in the original list slice.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    for councillor_tr in councillor_trs:
        desc = [text.strip()
                for text in councillor_tr.xpath('.//text()[normalize-space()]')
                if text.strip()]

        if len(desc) == 3:
            role = 'Maire'
            district = u'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace(u'numéro ', '')

        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]
        # XPath string() already returns the attribute value as a string;
        # indexing it with [0] kept only the first character of the URL.
        image = councillor_tr.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each two-cell first table row on COUNCIL_PAGE.

    The non-blank text nodes of a row end with name, phone and e-mail (in
    that order).  A row with exactly three text nodes is the mayor ('Maire'
    of Saint-Jérôme); otherwise the first text node is the district label,
    from which the word "numéro " is removed.  The last matching row is
    discarded, as in the original list slice.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillor_trs = [
        tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2
    ][:-1]
    for councillor_tr in councillor_trs:
        desc = [
            text.strip()
            for text in councillor_tr.xpath('.//text()[normalize-space()]')
            if text.strip()
        ]

        if len(desc) == 3:
            role = 'Maire'
            district = u'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace(u'numéro ', '')

        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]
        # XPath string() already returns the attribute value as a string;
        # indexing it with [0] kept only the first character of the URL.
        image = councillor_tr.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each link in a ``p.WSIndent`` on COUNCIL_PAGE.

    Link text containing "Ward <n>" is a ward 'Councillor'; any other link is
    taken to be the 'Mayor' of 'Kawartha Lakes'.  The e-mail address and
    portrait come from the page the link points to.
    """
    index = lxmlize(COUNCIL_PAGE)
    for link in index.xpath('//p[@class="WSIndent"]/a'):
        label = link.text_content()
        wards = re.findall(r'(Ward [0-9]{1,2})', label)
        if wards:
            district = wards[0]
            role = 'Councillor'
            name = label.replace(district, '').strip()
        else:
            district = 'Kawartha Lakes'
            role = 'Mayor'
            name = label.replace('Mayor', '').strip()

        url = link.attrib['href']
        profile = lxmlize(url)
        mailto = profile.xpath('//a[contains(@href, "mailto:")]/@href')[0]
        # Keep whatever follows the last ':' of the mailto link.
        email = mailto.rsplit(':', 1)[1].strip()
        image = profile.xpath('//img[@class="image-right"]/@src')[0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        p.image = image
        yield p
def get_people(self):
    """Yield a Legislator for each non-empty member paragraph on COUNCIL_PAGE.

    The name is looked up in ``font/b``, then ``font``, and finally ``b/font``
    when the text found so far contains "e-mail".  A name containing "Mayor"
    gets the 'Mayor' role (with the word removed); everyone is posted to
    "LaSalle".  An office phone is optional; a home phone is required.
    """
    root = lxmlize(COUNCIL_PAGE)
    for para in root.xpath('//div[@align="center" and not(@class="background")]//td/p'):
        content = para.text_content()
        if not content.strip():
            continue

        names = para.xpath('./font/b/text()') or para.xpath('./font/text()')
        if 'e-mail' in names[0]:
            names = para.xpath('./b/font/text()')
        name = names[0]

        if 'Mayor' in name:
            role = 'Mayor'
            name = name.replace('Mayor', '')
        else:
            role = 'Councillor'

        p = Legislator(name=name, post_id="LaSalle", role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = para.xpath('./parent::td//img/@src')[0]
        p.add_contact('email', para.xpath('.//a[contains(@href, "mailto:")]/text()')[0], None)

        # Office number: the text between "phone:" and the following "home".
        office = re.findall(r'(?<=phone:)(.*)(?=home)', content, flags=re.DOTALL)
        if office:
            p.add_contact('voice', office[0].strip(), 'legislature')

        home = re.findall(r'(?<=home phone:)(.*)', content, flags=re.DOTALL)[0]
        p.add_contact('voice', home.strip(), 'residence')
        yield p
def get_people(self):
    """Yield a Legislator for each table cell on COUNCIL_PAGE.

    The first ``strong`` text gives the name (before the first comma).  A
    cell mentioning "Mayor" but not "Deputy Mayor" is the 'Mayor' of
    'Fredericton'; otherwise the district is the parenthesised part of the
    "Ward:" line, without a trailing " Area".  Address and home phone are
    required; an office phone is added when present.
    """
    root = lxmlize(COUNCIL_PAGE)
    for cell in root.xpath('//table/tbody/tr/td'):
        heading = cell.xpath('.//strong/text()')[0]
        name = heading.split(',')[0].replace('Name:', '').strip()
        content = cell.text_content()

        if 'Mayor' in heading and 'Deputy Mayor' not in heading:
            role = 'Mayor'
            district = 'Fredericton'
        else:
            ward = re.findall(r'(Ward:.*)(?=Address:)', content)[0]
            ward = ward.replace(':', '').strip()
            district = re.search('\((.+?)(?: Area)?\)', ward).group(1)
            role = 'Councillor'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = cell.xpath('.//img/@src')[0]

        address = re.findall(r'(?<=Address:).*(?=Home:)', content)[0].strip()
        p.add_contact('address', address, 'legislature')

        home = re.findall(r'(?<=Home: \().*(?=Fax:)', content)[0]
        # Turn the ")" after the area code into "-" and keep the first token.
        home = re.sub(r'(?<=[0-9])(\)\D{1,2})(?=[0-9])', '-', home).split()[0]
        p.add_contact('voice', home, 'residence')

        office = re.findall(r'(?<=Office: \().*(?=Fax:)', content)
        if office:
            p.add_contact('voice', office[0].replace(') ', '-'), 'legislature')
        yield p
def scrape_mayor(url):
    """Return a Legislator for the mayor of 'Saskatoon' scraped from *url*.

    Name and image come from *url*; address, phone and fax come from the
    page behind its "Contact the Mayor" link.  Only *url* is recorded as a
    source.
    """
    page = lxmlize(url)
    name = page.xpath('//tr/td/p')[-1].text_content().replace('Mayor', '')
    image = page.xpath('//div[@class="sask_ArticleBody"]//img/@src')[0]

    contact_url = page.xpath('//a[contains(text(), "Contact the Mayor")]/@href')[0]
    contact_page = lxmlize(contact_url)

    # All three contact paragraphs live in the same rich-text container.
    container = '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]'
    address = ' '.join(contact_page.xpath(container + '/p[4]/text()')[1:])
    phone = contact_page.xpath(container + '/p[5]/span/text()')[0]
    phone = phone.replace('(', '').replace(') ', '-')
    fax = contact_page.xpath(container + '/p[6]/span/text()')[0]
    fax = fax.replace('(', '').replace(') ', '-')

    p = Legislator(name=name, post_id='Saskatoon', role='Mayor')
    p.add_source(url)
    p.image = image
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    return p
def get_people(self):
    """Yield a Legislator for each sub-navigation link on COUNCIL_PAGE.

    The first link is treated as the 'Mayor' of 'Ajax'.  For the others the
    ward and role ("Councillor" or "Regional Councillor") are parsed from the
    profile page's ``h1``.  Rows of the first ``datatable`` after its header
    row become contacts when labelled Phone/Fax/Email (mapped through
    CONTACT_DETAIL_TYPE_MAP) and links otherwise.
    """
    index = lxmlize(COUNCIL_PAGE)
    links = index.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for position, link in enumerate(links):
        name = link.text_content()
        url = link.attrib['href']
        profile = lxmlize(url)

        if position == 0:
            district, role = 'Ajax', 'Mayor'
        else:
            heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            district = re.findall(r'Ward.*', heading)[0].strip()
            # First tuple element is the whole "(Regional )Councillor" match.
            role = re.findall('((Regional)? ?(Councillor))', heading)[0][0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = profile.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]

        for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
            cells = row.xpath('./td')
            label = cells[0].text_content().strip()
            value = cells[1].text_content().strip()
            if not re.match(r'(Phone)|(Fax)|(Email)', label):
                p.add_link(value, None)
                continue
            kind = CONTACT_DETAIL_TYPE_MAP[label]
            note = None if kind == 'email' else 'legislature'
            p.add_contact(kind, value, note)
        yield p
def scrape_councilor(self, page, h1, url):
    """Return a Legislator for the councillor described by *page*.

    *h1* is the heading text; the name is what follows "Councillor" in it.
    The post is the "Ward <n>" prefix of the first class-less ``strong``
    element.  E-mail, phone and (when present) address are read from *page*;
    COUNCIL_PAGE and *url* are recorded as sources.
    """
    name = h1.split('Councillor')[1]

    ward_text = page.xpath('string(//strong[not(@class)])')
    ward_text = ward_text.replace(u'\xa0', u' ')  # non-breaking spaces
    # The pattern must match a ward number followed by a ward name, but only
    # the number is used as the post.
    ward_num = re.search(r'(Ward \d+) (.+)', ward_text).group(1)

    p = Legislator(name=name, post_id=ward_num, role='Councillor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.image = page.xpath('string(//main//img/@src)')

    p.add_contact('email', page.xpath('string((//a[contains(@href, "@")])[1])'), None)

    addr_cell = page.xpath(
        '//*[contains(text(), "Toronto City Hall")]/ancestor::td')[0]
    phone_line = addr_cell.xpath('string((.//text()[contains(., "Phone:")])[1])')
    p.add_contact('voice', phone_line.split(':')[1], 'legislature')

    address = '\n'.join(addr_cell.xpath('./p[2]/text()')[:2])
    if address:
        p.add_contact('address', address, 'legislature')
    return p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor cell.

    The first ``td.ms-rteTableOddCol-0`` cell is passed to ``scrape_mayor``
    (defined elsewhere).  Remaining cells without links are skipped; in the
    others the first link carries the name and profile URL and the second
    link the district.  Every text node containing a digit on the profile
    page is added as a phone number, and ``get_links`` (defined elsewhere) is
    called on the profile's first such cell.
    """
    index = lxmlize(COUNCIL_PAGE)
    cells = index.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield scrape_mayor(cells[0])

    for cell in cells[1:]:
        anchors = cell.xpath('.//a')
        if not anchors:
            continue

        name = anchors[0].text_content().strip()
        district = anchors[1].text_content()
        url = cell.xpath('.//a/@href')[0]
        profile = lxmlize(url)

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = cell.xpath('./preceding-sibling::td//img/@src')[-1]

        for text in profile.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()'):
            if re.findall(r'[0-9]', text):
                p.add_contact('voice', text.strip().replace(' ', '-'), 'legislature')

        get_links(p, profile.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

        mailto = profile.xpath('string(//a[contains(@href, "mailto:")]/@href)')
        p.add_contact('email', mailto[len('mailto:'):], None)
        yield p
def get_people(self):
    """Yield the mayor of 'Pointe-Claire', then one Legislator per councillor.

    The mayor is read from the second ``p`` of the first table in the
    ``item-page`` div.  Councillors are read from the odd-indexed ``tr``
    rows: each cell holds the name and image, and the cell at the same
    position in the following row holds the district and phone number.
    """
    page = lxmlize(COUNCIL_PAGE)
    phone_pattern = r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}'

    mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    p = Legislator(name=mayor.xpath('.//strong/text()')[0],
                   post_id='Pointe-Claire', role='Maire')
    p.add_source(COUNCIL_PAGE)
    mayor_phone = re.findall(phone_pattern, mayor.text_content())[0]
    p.add_contact('voice', mayor_phone.replace(' ', '-'), 'legislature')
    yield p

    rows = page.xpath('//tr')
    for i in range(1, len(rows), 2):
        for j, cell in enumerate(rows[i].xpath('./td')):
            name = cell.text_content()
            # Alternative source for the district number, kept for reference:
            # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j]
            district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')

            p = Legislator(name=name, post_id=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.image = cell.xpath('.//img/@src')[0]

            details = rows[i + 1].xpath('.//td')[j].text_content()
            phone = re.findall(phone_pattern, details)[0].replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            yield p
def scrape_mayor(self, url):
    """Return a Legislator for the mayor of "Toronto" scraped from *url*.

    The name is the first ``h1`` text without "Toronto Mayor".  Address and
    phone are read from the paragraphs following the first two ``h3``
    headings of the "Contact the Mayor" page.  COUNCIL_PAGE, *url* and the
    contact page are all recorded as sources.
    """
    profile = lxmlize(url)
    name = profile.xpath('//h1/text()')[0].replace('Toronto Mayor', '').strip()

    p = Legislator(name, post_id='Toronto', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.image = profile.xpath('string(//article/img/@src)')

    contact_link = profile.xpath('//a[contains(text(), "Contact the Mayor")]')[0]
    # @todo fix lxmlize to use the redirected URL to make links absolute
    contact_url = contact_link.attrib['href'].replace('www.', 'www1.')
    p.add_source(contact_url)

    contact_page = lxmlize(contact_url)
    mail_heading, phone_heading = contact_page.xpath('//h3')[:2]
    address = ''.join(mail_heading.xpath('./following-sibling::p//text()'))
    phone = phone_heading.xpath('string(./following-sibling::p[1])')

    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    return p
def scrape_mayor(url):
    """Return a Legislator for the mayor of 'Moncton' scraped from *url*.

    The name is the second and third words of the second content paragraph.
    Address, phone, fax and e-mail come from the second matching cell of the
    ``whiteroundedbox`` table.  A phone number with only seven digits is
    prefixed with the 506 area code.
    """
    page = lxmlize(url)

    intro = page.xpath('//div[@id="content"]/p[2]/text()')[0]
    name = ' '.join(intro.split()[1:3])

    p = Legislator(name=name, post_id='Moncton', role='Mayor')
    p.add_source(url)
    p.image = page.xpath('//div[@id="content"]/p[1]/img/@src')[0]

    box = page.xpath('//table[@class="whiteroundedbox"]//tr[2]/td[1]')[1]

    address = ', '.join(box.xpath('./p[1]/text()')[1:4])
    address = re.sub(r'\s{2,}', ' ', address).strip()  # collapse whitespace runs

    numbers = box.xpath('.//p[2]/text()')
    phone = numbers[0].split(':')[1].strip()
    fax = numbers[1].split(':')[1].strip()
    email = box.xpath('.//a/@href')[0].split(':')[1].strip()

    p.add_contact('address', address, 'legislature')
    if len(re.sub(r'\D', '', phone)) == 7:
        phone = '506-%s' % phone
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    return p
def get_people(self):
    """Yield a Legislator for each non-empty cell of the PageContent table.

    COUNCIL_PAGE is fetched with 'iso-8859-1' passed to ``lxmlize``.  The
    first cell is the 'Maire' of 'Kirkland'; in the others the district is
    the part of the ``h2`` after "- ", with " Ouest"/" Est" lower-cased.
    """
    root = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    cells = root.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    for position, cell in enumerate(cells):
        if not cell.text_content().strip():
            continue

        if position == 0:
            district, role = 'Kirkland', 'Maire'
        else:
            heading = cell.xpath('.//h2')[0].text_content()
            district = re.search('- (.+)', heading).group(1).strip()
            district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
            role = 'Conseiller'

        name = cell.xpath('.//strong/text()')[0]

        phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
        # Drop the "T " prefix, dash-separate, and rewrite ", # " as an
        # " x" extension marker.
        phone = phone.replace('T ', '').replace(' ', '-').replace(',-#-', ' x')

        email = cell.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        p.image = cell.xpath('.//img/@src')[0]
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator per "Councillor" paragraph.

    COUNCIL_PAGE is fetched with an MSIE user agent.  The mayor comes from
    ``self.scrape_mayor(page)``.  A councillor's phone is taken from their
    own paragraph, or failing that by repeatedly stepping to the second
    following sibling paragraph until one contains "Phone".  Seven-digit
    numbers are prefixed with the 902 area code.
    """
    page = lxmlize(
        COUNCIL_PAGE,
        user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')
    yield self.scrape_mayor(page)

    paragraphs = page.xpath(
        '//strong[contains(text(), "Councillor")]/parent::p'
        '|//b[contains(text(), "Councillor")]/parent::p')
    for para in paragraphs:
        name = para.xpath('./strong/text()|./b/text()')[0]
        name = name.replace('Councillor', '').strip()
        district = re.findall('(?<=Ward \d, ).*', para.text_content())[0].strip()

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.image = para.xpath('.//img/@src')[0]

        cursor = para
        phones = re.findall(r'Phone(.*)', cursor.text_content())
        while not phones:
            cursor = cursor.xpath('./following-sibling::p')[1]
            phones = re.findall(r'Phone(.*)', cursor.text_content())
        phone = phones[0].strip()

        mail_links = (para.xpath('.//a[contains(@href, "mailto:")]')
                      or para.xpath('./following-sibling::p//a[contains(@href, "mailto")]'))
        email = mail_links[0].text_content()

        if len(re.sub(r'\D', '', phone)) == 7:
            phone = '902-%s' % phone
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator per paragraph of the news div.

    The mayor comes from ``mayor_info(MAYOR_PAGE)`` (defined elsewhere).  A
    district label containing "Regional" becomes a 'Regional Councillor' for
    'Cambridge'; otherwise the label itself is the post.  Address, phone and
    e-mail are read from the councillor's own page.

    The fax number is optional: it is added only when the profile page
    mentions "Fax".  Previously ``fax`` was left unset in that case, so the
    unconditional ``add_contact`` either raised NameError or reused the
    previous councillor's fax number.
    """
    yield mayor_info(MAYOR_PAGE)

    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="news"]//p')
    for councillor in councillors:
        district = councillor.xpath('./b')[0].text_content()
        district = re.findall(u'(?:W|R).*', district)[0]
        role = 'Councillor'
        if 'Regional' in district:
            district = 'Cambridge'
            role = 'Regional Councillor'

        link = councillor.xpath('.//a')[0]
        name = link.text_content()
        url = link.attrib['href']
        profile = lxmlize(url)

        image = profile.xpath('//img[contains(@src, "councilImages")]/@src')[0]

        # Each value is the text after the last ':' in the innermost table
        # cell that contains the label, with tab characters removed.
        address = profile.xpath('//*[contains(text(),"Address")]/ancestor::td')[-1]
        address = address.text_content().split(':')[-1].replace("\t", '')

        phone = profile.xpath('//*[contains(text(),"Tel")]/ancestor::td')[-1]
        phone = phone.text_content().split(':')[-1].replace("\t", '')
        phone = phone.replace('(', '').replace(') ', '-')

        fax = None  # reset per councillor so a value never leaks across rows
        if profile.xpath('//*[contains(text(),"Fax")]'):
            fax = profile.xpath('//*[contains(text(),"Fax")]/ancestor::td')[-1]
            fax = fax.text_content().split(':')[-1].replace("\t", '')
            fax = fax.replace('(', '').replace(') ', '-')

        email = profile.xpath('//a[contains(@href,"mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        if fax:
            p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        p.image = image
        yield p
def get_people(self):
    """Yield a Legislator for each ``strong`` element in the printArea div.

    The non-blank text nodes of the enclosing ``p`` (or ``div`` when the
    ``p`` lookup finds nothing) start with the post line, followed by
    "<label>: <number>" contact lines.  A post line containing "Mayor" is the
    'Mayor' of 'Woolwich'; otherwise it is truncated after "Ward <digit>" and
    the word "Councillor" is removed.  Each contact line is added with its
    label used as both contact type and note.
    """
    root = lxmlize(COUNCIL_PAGE)
    for member in root.xpath('//div[@id="printArea"]//strong'):
        lines = member.xpath('./parent::p/text()') or member.xpath('./parent::div/text()')
        lines = [line for line in lines if line.strip()]

        post = re.sub('(?<=Ward \d).+', '', lines.pop(0))
        if 'Mayor' in post:
            post, role = 'Woolwich', 'Mayor'
        else:
            post, role = post.replace('Councillor', '').strip(), 'Councillor'

        p = Legislator(name=member.text_content(), post_id=post, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = member.xpath('./img/@src')[0]

        for line in lines:
            label, number = line.split(':')
            number = number.strip().replace('(', '').replace(') ', '-')
            number = number.replace('extension ', 'x')
            p.add_contact(label, number, label)
        yield p
def get_people(self):
    """Yield a Legislator for each link in the subnav of COUNCIL_PAGE.

    The first link is delegated to ``self.scrape_mayor(name, url)``.  For
    every other link the phone, fax, e-mail and image are read from the
    linked page; the address is added only when a paragraph mentioning
    "City of Burlington," exists.
    """
    index = lxmlize(COUNCIL_PAGE)
    links = index.xpath('//div[@id="subnav"]//a')
    for position, link in enumerate(links):
        name = link.xpath('./span/text()')[0].strip()
        district = link.xpath('.//strong')[0].text_content()
        url = link.attrib['href']

        if position == 0:
            yield self.scrape_mayor(name, url)
            continue

        profile = lxmlize(url)
        address = profile.xpath('//div[@id="content"]//p[contains(text(),"City of Burlington,")]')
        contact = profile.xpath('//div[@id="subnav"]//p[contains(text(),"Phone")]')[0]
        contact_text = contact.text_content()

        phone = re.findall(r'Phone: (.*)', contact_text)[0]
        phone = phone.replace('Ext. ', 'x').replace('#', 'x')
        fax = re.findall(r'Fax: (.*)', contact_text)[0]
        # NOTE: the leading '//' makes this search the whole document rather
        # than only the contact paragraph.
        email = contact.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = profile.xpath('//div[@id="subnav"]//img/@src')[0]

        if address:
            p.add_contact('address', address[0].text_content(), 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each link in div ``c2087`` of COUNCIL_PAGE.

    A profile whose first ``h2`` text contains "Maire" is the 'Maire' of
    'Sherbrooke'.  Otherwise the district comes from the first
    ``target="_blank"`` link text, normalised ("district" removed, "Domaine
    Howard" hyphenated, leading "de " dropped for Brompton and Lennoxville).
    Brompton and Lennoxville additionally get a ``boundary_url`` extra.
    """
    index = lxmlize(COUNCIL_PAGE)
    for link in index.xpath('//div[@id="c2087"]//a'):
        name = link.text_content()
        url = link.attrib['href']
        profile = lxmlize(url)

        if 'Maire' in profile.xpath('//h2/text()')[0]:
            district, role = 'Sherbrooke', 'Maire'
        else:
            district = profile.xpath('//div[@class="csc-default"]//a[@target="_blank"]/text()')[0]
            district = district.replace('district', '')
            district = district.replace('Domaine Howard', 'Domaine-Howard').strip()
            role = 'Conseiller'

        if district in ('de Brompton', 'de Lennoxville'):
            district = district.replace('de ', '')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = profile.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]

        # The text before the first ':' is used as both contact type and note.
        fields = profile.xpath('//li[contains(text(), "phone")]/text()')[0].split(':')
        p.add_contact(fields[0], fields[1], fields[0])

        mailtos = profile.xpath('//a[contains(@href, "mailto:")]/@href')
        if mailtos:
            p.add_contact('email', mailtos[0].split(':')[1], None)

        if district == 'Brompton':
            p.add_extra('boundary_url', '/boundaries/sherbrooke-boroughs/brompton/')
        elif district == 'Lennoxville':
            p.add_extra('boundary_url', '/boundaries/sherbrooke-boroughs/lennoxville/')
        yield p
def scrape_mayor(self, name, url):
    """Return a Legislator for the mayor of "Burlington".

    Phone, fax, e-mail and image are read from the right sidebar of *url*.
    The address is read from the "Contact" page linked from the menu of
    http://www.burlingtonmayor.com.
    """
    mayor_site = 'http://www.burlingtonmayor.com'
    sidebar = '//div[@id="secondary align_RightSideBar"]'

    page = lxmlize(url)
    numbers = page.xpath(sidebar + '/blockquote/p/text()')
    phone = numbers[0]
    fax = numbers[1]
    email = page.xpath(sidebar + '/blockquote/p/a[contains(@href, "mailto:")]/text()')[0]

    home = lxmlize(mayor_site)
    contact_link = home.xpath('//div[@class="menu"]//a[contains(text(),"Contact")]')[0]
    contact_page = lxmlize(contact_link.attrib['href'])
    address = contact_page.xpath(
        '//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0].text_content()

    p = Legislator(name=name, post_id="Burlington", role='Mayor')
    for source in (COUNCIL_PAGE, url, mayor_site):
        p.add_source(source)
    p.image = page.xpath(sidebar + '/p/img/@src')[0]

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.add_contact('address', address, 'legislature')
    return p
def councillor_data(url):
    """Return a Legislator for the councillor whose profile is at *url*.

    Name, district, address and photo are read with XPath ``string()``
    queries.  The phone comes from ``get_phone_data`` and the e-mail from
    ``email_js`` (both defined elsewhere).
    """
    page = lxmlize(url)

    name = page.xpath('string(//h1[@id="TitleOfPage"])')
    district = page.xpath('string(//h2)')
    address = page.xpath('string(//div[@class="asideContent"])')
    photo = page.xpath('string(//div[@id="contentright"]//img[1]/@src)')
    phone = get_phone_data(page)

    # TODO: Councillor emails are built with JS to prevent scraping, but the
    # JS can be scraped.
    email = email_js(page.xpath('string(//span/script)'))

    p = Legislator(name=name, post_id=district, role='Councillor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    for kind, value, note in (('address', address, 'legislature'),
                              ('voice', phone, 'legislature'),
                              ('email', email, None)):
        p.add_contact(kind, value, note)
    p.image = photo
    return p
def get_people(self):
    """Yield a Legislator for each first table row of the WebPartWPQ1 div.

    The heading text gives both name and role: removing "Deputy ", "Warden "
    and "Mayor" leaves the name, and what the name does not cover is the role
    (defaulting to "Councillor").  The district follows the comma in the
    first cell's paragraph, without a municipality-type prefix or a
    " Township" suffix.  Residence and municipal-office contact blocks are
    handed to ``self.get_contacts``.
    """
    root = lxmlize(COUNCIL_PAGE)
    for row in root.xpath('//div[@id="WebPartWPQ1"]/table/tbody/tr[1]'):
        # Prefer the innermost of four nested <strong> tags when present.
        headings = (row.xpath(".//td[1]//strong//strong//strong//strong")
                    or row.xpath(".//td[1]//strong"))
        text = headings[0].text_content()

        name = text.strip()
        for title in ("Deputy ", "Warden ", "Mayor"):
            name = name.replace(title, "")
        role = text.replace(name, "").strip() or "Councillor"
        if "," in name:
            name = name.split(",")[0].strip()

        district = row.xpath('.//td[1]//p[contains(text(),",")]/text()')[0]
        district = district.split(",")[1].strip()
        district = re.sub(
            r"\A(?:City|Municipality|Town|Township|Village) of\b| Township\Z",
            "", district)

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = row.xpath(".//td[1]//img/@src")[0]

        info = row.xpath(".//td[2]")[0].text_content()
        residence = re.findall(
            r"(?<=Residence:)(.*)(?=Municipal Office:)", info, flags=re.DOTALL)[0]
        self.get_contacts(residence, "residence", p)
        office = re.findall(r"(?<=Municipal Office:)(.*)", info, flags=re.DOTALL)[0]
        self.get_contacts(office, "legislature", p)
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator per councillor table row.

    The mayor and councillor pages are found through the "Mayor" and
    "Councillors" links on COUNCIL_PAGE.  Each ``tr`` containing an image
    (except the last) must have exactly two cells: the image cell and the
    info cell with name, district, e-mail and telephone.

    The portrait is read from the ``src`` attribute of the image inside the
    row's own image cell.  The previous query, ``string(//img/@href)``, was
    document-absolute and asked for an attribute ``img`` elements do not
    carry, so every image resolved to the councillors page URL itself.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = lxmlize(councillors_url)

    councillor_rows = cpage.xpath('//tr[td//img]')[:-1]
    for councillor_row in councillor_rows:
        img_cell, info_cell = tuple(councillor_row)

        name = info_cell.xpath(
            'string(.//span[contains(text(), "Councillor")])')[len('Councillor '):]
        district = info_cell.xpath('string(.//p[contains(text(), "District")])')

        email = info_cell.xpath('string(.//a[contains(@href, "mailto:")])')
        if not email:
            # Fall back to the plain text following an "E-mail" label.
            email = info_cell.xpath(
                'string(.//strong[contains(text(), "E-mail")]/following-sibling::text())')

        phone = info_cell.xpath(
            'string(.//p[contains(.//text(), "Telephone:")])').split(':')[1]

        # Relative path and @src: this row's image, not the page's first one.
        img_url_rel = img_cell.xpath('string(.//img/@src)')
        img_url = urljoin(councillors_url, img_url_rel)

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email, None)
        p.add_contact('voice', phone, 'legislature')
        p.image = img_url
        yield p
def councillor_data(url):
    """Return a Legislator for the councillor whose profile is at *url*.

    Name, district, address and photo are read with XPath ``string()``
    queries.  The phone comes from ``get_phone_data`` and the e-mail from
    ``email_js`` (both defined elsewhere).
    """
    page = lxmlize(url)

    name = page.xpath('string(//h1[@id="TitleOfPage"])')
    district = page.xpath('string(//h2)')
    address = page.xpath('string(//div[@class="asideContent"])')
    photo = page.xpath('string(//div[@id="contentright"]//img[1]/@src)')
    phone = get_phone_data(page)

    # TODO: Councillor emails are built with JS to prevent scraping, but the
    # JS can be scraped.
    email = email_js(page.xpath('string(//span/script)'))

    p = Legislator(name=name, post_id=district, role='Councillor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    for kind, value, note in (('address', address, 'legislature'),
                              ('voice', phone, 'legislature'),
                              ('email', email, None)):
        p.add_contact(kind, value, note)
    p.image = photo
    return p
def scrape_mayor(self):
    """Return a Legislator for the mayor of 'Summerside' from MAYOR_PAGE.

    MAYOR_PAGE is fetched with 'iso-8859-1' passed to ``lxmlize``.  The
    contact details are read by position from the paragraphs of the article
    body: phone from the 2nd, address from the 4th and 5th, e-mail from the
    6th.
    """
    page = lxmlize(MAYOR_PAGE, 'iso-8859-1')

    title = page.xpath('//div[@class="articletitle"]/h1')[0].text_content()
    p = Legislator(name=title.replace('Mayor', ''), post_id='Summerside', role='Mayor')
    p.add_source(MAYOR_PAGE)

    image = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
    p.image = image.replace('..', '')  # drop parent-directory markers

    paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
    phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
    street = paragraphs[3].text_content().replace('by mail: ', '')
    address = street + ' ' + paragraphs[4].text_content()
    email = paragraphs[5].xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('address', address, 'legislature')
    p.add_contact('email', email, None)
    return p
def get_people(self):
    """Yield a Legislator for each name cell on COUNCIL_PAGE.

    Every member shares the general phone and fax numbers that follow the
    ``large_title`` paragraph.  ``strong`` elements containing "@" are
    skipped.  A label containing "Mayor" is the 'Maire' of
    'Dollard-Des Ormeaux'; otherwise the first digit gives the district
    number and the name is the text after it.  An e-mail is added only when
    a mailto link follows.
    """
    root = lxmlize(COUNCIL_PAGE, 'iso-8859-1')

    shared = root.xpath('//p[@class="large_title"]/following-sibling::p/text()')
    general_phone = shared[0]
    general_fax = shared[1]

    members = [node for node in root.xpath('//tr/td/p/strong')
               if "@" not in node.text_content()]
    for member in members:
        label = member.text_content()
        if 'Mayor' in label:
            name = label.replace('Mayor', '')
            district = 'Dollard-Des Ormeaux'
            role = 'Maire'
        else:
            name = re.split(r'[0-9]', label)[1]
            district = 'District ' + re.findall(r'[0-9]', label)[0]
            role = 'Conseiller'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = member.xpath(
            './parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]

        mail_links = member.xpath(
            './parent::p/following-sibling::p//a[contains(@href, "mailto:")]')
        if mail_links:
            p.add_contact('email', mail_links[0].text_content(), None)
        p.add_contact('voice', general_phone, 'legislature')
        p.add_contact('fax', general_fax, 'legislature')
        yield p
def get_people(self):
    """Yield a Legislator for each row of the CSV at COUNCIL_CSV_URL.

    The role is the "Elected office" column; the mayor is posted to 'Ottawa'
    and everyone else to their "District name", with three known
    misspellings corrected.
    """
    # Correct typos. The City has been notified of the errors.
    corrections = {
        u'Knoxdale Merivale': u'Knoxdale-Merivale',
        u'Rideau Vanier': u'Rideau-Vanier',
        u'Orleans': u'Orléans',
    }
    address_columns = ('Address line 1', 'Address line 2', 'Locality',
                       'Postal code', 'Province')

    rows = DictReader(urlopen(COUNCIL_CSV_URL))
    for row in rows:
        name = '%s %s' % (row['First name'], row['Last name'])
        role = row['Elected office']
        district = 'Ottawa' if role == 'Mayor' else row['District name']
        district = corrections.get(district, district)

        email = row['Email']
        address = ', '.join([row[column] for column in address_columns])
        phone = row['Phone']
        photo_url = row['Photo URL']

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_CSV_URL)
        p.add_contact('email', email, None)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.image = photo_url
        yield p
def scrape_mayor(self, div):
    """Return a Legislator for the mayor of 'Guelph' described by *div*.

    Name and profile URL come from the first link in *div*, the phone from
    its third non-blank text node, and the e-mail from its mailto link.
    Facebook and Twitter links plus the portrait are read from the profile
    page.
    """
    link = div.xpath('.//a')[0]
    name = link.text_content().replace('Mayor', '')
    url = link.attrib['href']

    p = Legislator(name=name, post_id='Guelph', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)

    phone = div.xpath('.//text()[normalize-space()]')[2]
    email = div.xpath('.//a[contains(@href,"mailto:")]')[0].text_content()
    profile = lxmlize(url)

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('email', email, None)

    for network in ('facebook', 'twitter'):
        anchor = profile.xpath(
            '//div[@class="entry-content"]//a[contains(@href, "%s")]' % network)[0]
        p.add_link(anchor.attrib['href'], None)

    p.image = profile.xpath('//header/img/@src')[0]
    return p
def scrape_mayor(self, name, url):
    """Return a Legislator for the mayor of "Burlington".

    Phone, fax, e-mail and image are read from the right sidebar of *url*.
    The address is read from the "Contact" page linked from the menu of
    http://www.burlingtonmayor.com.
    """
    mayor_site = 'http://www.burlingtonmayor.com'
    sidebar = '//div[@id="secondary align_RightSideBar"]'

    page = lxmlize(url)
    numbers = page.xpath(sidebar + '/blockquote/p/text()')
    phone = numbers[0]
    fax = numbers[1]
    email = page.xpath(sidebar + '/blockquote/p/a[contains(@href, "mailto:")]/text()')[0]

    home = lxmlize(mayor_site)
    contact_link = home.xpath('//div[@class="menu"]//a[contains(text(),"Contact")]')[0]
    contact_page = lxmlize(contact_link.attrib['href'])
    address = contact_page.xpath(
        '//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0].text_content()

    p = Legislator(name=name, post_id="Burlington", role='Mayor')
    for source in (COUNCIL_PAGE, url, mayor_site):
        p.add_source(source)
    p.image = page.xpath(sidebar + '/p/img/@src')[0]

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.add_contact('address', address, 'legislature')
    return p
def get_people(self):
    """Yield the mayor of 'Gatineau', then a Legislator per district.

    The council list is rendered client-side, so districts, member names and
    profile links are pulled with regular expressions out of the second
    ``script`` of the ``inner_container`` div.  The first entry of each
    array belongs to the mayor; the remaining entries are councillors whose
    photo and e-mail come from their profile page.

    The zipped arrays are materialised with ``list()`` before slicing:
    ``zip()`` returns an iterator on Python 3 and cannot be subscripted.
    """
    page = lxmlize(COUNCIL_PAGE)

    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # The contact page is still fetched, as before, but nothing is read from
    # it: the e-mail below is hardcoded.
    lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email, None)
    yield p

    for district, member, url in list(zip(districts, members, urls))[1:]:
        # Profile pages live directly under COUNCIL_PAGE.
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = lxmlize(profile_url)
        photo_url = profile_page.xpath('string(//img/@src)')
        post_id = 'District ' + re.search(r'\d+', district).group(0)
        email = profile_page.xpath(
            'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]

        p = Legislator(name=member, post_id=post_id, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each link in the navMultilevel div.

    The first link is handed to ``self.scrape_mayor``.  Later links read
    "<district> - <name>"; iteration stops at the first link without a '-',
    and links named 'Vacant' are skipped.  Contact details and the second
    image of the primary article come from the linked page.
    """
    index = lxmlize(COUNCIL_PAGE)
    links = index.xpath('//div[@id="navMultilevel"]//a')
    for position, link in enumerate(links):
        if position == 0:
            yield self.scrape_mayor(link)
            continue

        label = link.text_content()
        if '-' not in label:
            break
        district, name = label.split(' - ')
        if name == 'Vacant':
            continue

        url = link.attrib['href']
        profile = lxmlize(url)

        address = profile.xpath('//div[@class="column last"]//p')[0].text_content()

        # Numbers start at the first digit; ") " after the area code becomes "-".
        phone = profile.xpath('//article[@id="primary"]//*[contains(text(),"Tel")]')[0].text_content()
        phone = re.findall(r'([0-9].*)', phone)[0].replace(') ', '-')
        fax = profile.xpath('//article[@id="primary"]//*[contains(text(),"Fax")]')[0].text_content()
        fax = re.findall(r'([0-9].*)', fax)[0].replace(') ', '-')

        email = profile.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        p.image = profile.xpath('//article[@id="primary"]//img/@src')[1]
        yield p
def get_people(self):
    """Yield a Legislator for each link in a ``p.WSIndent`` on COUNCIL_PAGE.

    Link text containing "Ward <n>" is a ward 'Councillor'; any other link is
    taken to be the 'Mayor' of 'Kawartha Lakes'.  The e-mail address and
    portrait come from the page the link points to.
    """
    index = lxmlize(COUNCIL_PAGE)
    for link in index.xpath('//p[@class="WSIndent"]/a'):
        label = link.text_content()
        wards = re.findall(r'(Ward [0-9]{1,2})', label)
        if wards:
            district = wards[0]
            role = 'Councillor'
            name = label.replace(district, '').strip()
        else:
            district = 'Kawartha Lakes'
            role = 'Mayor'
            name = label.replace('Mayor', '').strip()

        url = link.attrib['href']
        profile = lxmlize(url)
        mailto = profile.xpath('//a[contains(@href, "mailto:")]/@href')[0]
        # Keep whatever follows the last ':' of the mailto link.
        email = mailto.rsplit(':', 1)[1].strip()
        image = profile.xpath('//img[@class="image-right"]/@src')[0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        p.image = image
        yield p
def get_people(self):
    """Yield the mayor of 'Gatineau', then a Legislator per district.

    The council list is rendered client-side, so districts, member names and
    profile links are pulled with regular expressions out of the second
    ``script`` of the ``inner_container`` div.  The first entry of each
    array belongs to the mayor; the remaining entries are councillors whose
    photo and e-mail come from their profile page.

    The zipped arrays are materialised with ``list()`` before slicing:
    ``zip()`` returns an iterator on Python 3 and cannot be subscripted.
    """
    page = lxmlize(COUNCIL_PAGE)

    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # The contact page is still fetched, as before, but nothing is read from
    # it: the e-mail below is hardcoded.
    lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email, None)
    yield p

    for district, member, url in list(zip(districts, members, urls))[1:]:
        # Profile pages live directly under COUNCIL_PAGE.
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = lxmlize(profile_url)
        photo_url = profile_page.xpath('string(//img/@src)')
        post_id = 'District ' + re.search(r'\d+', district).group(0)
        email = profile_page.xpath(
            'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]

        p = Legislator(name=member, post_id=post_id, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator per regional councillor, then the regional chair.

    Councillors are grouped under ``<h3>`` headings (the first heading is
    ignored); the post is the heading text after ``"of "``.  Contact details
    for each councillor come from ``councillor_data``; the chair's details
    are read from ``CHAIR_URL``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    headings = listing.xpath('//*[@id="contentIntleft"]//h3')[1:]
    for heading in headings:
        # Gather the first child of every <p> that follows this <h3>,
        # stopping at the first following sibling that is not a <p>.
        member_links = []
        for sibling in heading.xpath('./following-sibling::*'):
            if sibling.tag != 'p':
                break
            member_links.append(sibling[0])

        for member_link in member_links:
            post = re.search('of (.*)', heading.text).group(1)
            p = Legislator(name=member_link.text, post_id=post, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            member_url = member_link.attrib['href']
            p.add_source(member_url)

            email, phone, address, photo_url = councillor_data(member_url)
            p.add_contact('email', email, None)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('address', address, 'legislature')
            p.image = photo_url
            yield p

    chair_page = lxmlize(CHAIR_URL)
    page_title = chair_page.xpath('string(//title)')
    name = re.search('Chair (.*) -', page_title).group(1)
    email = chair_page.xpath('string(//a[contains(text(), "E-mail")]/@href)')
    phone_text = chair_page.xpath(
        'string((//span[@class="labelTag"][contains(text(), "Phone")]/parent::*/text())[1])')
    phone = phone_text.strip(':')
    address_lines = chair_page.xpath('//div[@class="contactBody"]//p[1]/text()')
    address = '\n'.join(address_lines)
    photo_url_src = chair_page.xpath('string(//div[@id="contentIntleft"]//img[1]/@src)')
    photo_url = urljoin(CHAIR_URL, photo_url_src)

    p = Legislator(name=name, post_id='Waterloo', role='Regional Chair')
    p.add_source(CHAIR_URL)
    p.add_contact('email', email, None)
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('address', address, 'legislature')
    p.image = photo_url
    yield p
def get_people(self):
    """Yield a Legislator for each ward councillor, then one for the mayor.

    Councillor links on the council page read ``"<name> - <district>"``;
    email, phone and photo come from each linked page.  The mayor's details
    are scraped from ``MAYOR_PAGE``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    ward_links = listing.xpath(
        '//span[@class="textimagetype"]//a[contains(text(), "- Ward")]')
    for ward_link in ward_links:
        name, district = ward_link.text.split(' - ')
        profile_url = ward_link.attrib['href']
        profile = lxmlize(profile_url)

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)

        email = profile.xpath('string(//a[contains(@href, "@")])')
        p.add_contact('email', email, None)

        # Use the segment after the first colon of the first text node that
        # mentions "Phone".
        phone_text = profile.xpath('string(//text()[contains(., "Phone")])')
        p.add_contact('voice', phone_text.split(':')[1], 'legislature')

        relative_src = profile.xpath('string((//span/img)[1]/@src)')
        p.image = urljoin(profile_url, relative_src)
        yield p

    mayor_page = lxmlize(MAYOR_PAGE)
    # The name is taken as the first two words of the "is married to" text.
    bio_text = mayor_page.xpath('//p[contains(text(), "is married to")]/text()')[0]
    name = ' '.join(bio_text.split()[:2])
    office_lines = mayor_page.xpath('//p[contains(text(), "Mayor\'s Office")]/text()')
    address = ' '.join(office_lines[1:])
    # All but the last text node of the "Phone:" paragraph: phone, then fax.
    phone, fax = mayor_page.xpath('//p[contains(text(), "Phone:")]/text()')[:-1]
    phone = phone.strip().replace('(', '').replace(') ', '-')
    fax = fax.strip().replace('(', '').replace(') ', '-').split(':')[1]
    email = mayor_page.xpath('//a[contains(@href, "mailto:")]/text()')[0]

    p = Legislator(name=name, post_id='Windsor', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.image = mayor_page.xpath(
        '//div[@class="sectioning"]//img[contains(@title, "Mayor")]/@src')[0]
    yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor title span.

    Every ``<span class="Title">`` on the council page describes one person.
    The first span is the mayor.  Later spans are expected to read
    ``"<name> - <ward ...>"`` (hyphen or en dash); spans that do not split
    into exactly two parts are skipped.
    """
    root = lxmlize(COUNCIL_PAGE)
    title_spans = root.xpath('//span[@class="Title"]')

    mayor_span = title_spans[0]
    mayor_text = ' '.join(mayor_span.xpath('.//text()'))
    # The mayor's name is everything before the first "(".
    mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
    mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
    mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

    m = Legislator(name=mayor_name, post_id='Charlottetown', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('email', mayor_email, None)
    m.image = mayor_photo
    yield m

    for span in root.xpath('//span[@class="Title"]')[1:]:
        span_text = ' '.join(span.xpath('.//text()'))
        header = span_text.replace(u'\u2013', '-').split('-')
        if len(header) != 2:
            continue
        name_part, ward_part = header

        # Strip the title and any parenthesised note, then collapse spaces.
        name = name_part.strip().replace('Councillor', '')
        name = re.sub(r'\(.+?\)', '', name)
        name = ' '.join(name.split())

        district_name = ward_part.strip()  # full label; not used below
        # The post id is the first two words of the ward label.
        district_id = ' '.join(ward_part.split()[:2])

        # needed a wacky xpath to deal with ward 8
        photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
        photo_url = urljoin(COUNCIL_PAGE, photo[0])
        email = span.xpath('string(following::a[1]/text())')

        p = Legislator(name=name, post_id=district_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
            p.add_contact('email', email, None)
        p.image = photo_url
        yield p
def get_people(self):
    """Yield the mayor, then every councillor listed in the council table.

    The first table row holds the mayor.  Rows from the third up to (but not
    including) the last are consumed in pairs: a ward-name row followed by a
    row whose cells each describe one councillor.
    """
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="litcontentDiv"]//tr')
    mayor_row = rows[0]
    ward_rows = rows[2:-1]

    mayor_photo = urljoin(COUNCIL_PAGE, mayor_row.xpath('string(.//img/@src)'))
    mayor_cell = mayor_row.xpath('./td')[1]
    mayor_name = mayor_cell.xpath('string(.//strong)')
    mayor_href = mayor_cell.xpath('string(.//a[contains(., "@")]/@href)')
    # Drop an optional leading "mailto:" from the href.
    mayor_email = re.match('(?:mailto:)?(.*)', mayor_href).group(1)

    p = Legislator(name=mayor_name, post_id='Sault Ste. Marie', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_contact('email', mayor_email, None)
    p.image = mayor_photo
    yield p

    # Rows alternate between a ward name and that ward's councillors, so
    # pulling twice from one iterator pairs them up.
    row_iter = iter(ward_rows)
    for ward_row, data_row in zip(row_iter, row_iter):
        district = ward_row.xpath('string(.//text()[contains(., "Ward")])')
        district_num = district_name_using_number(district)
        for cell in data_row.xpath('./td'):
            name = cell.xpath('string(.//strong)')
            if not name:
                # Bad markup: fall back to the text after the <strong> tag.
                name = cell.xpath('string(.//strong/following-sibling::'
                                  'text())')
            href = cell.xpath('string(.//a[contains(., "@")]/@href)')
            email = re.match('(?:mailto:)?(.*)', href).group(1)
            photo_url = urljoin(COUNCIL_PAGE, cell.xpath('string(.//img/@src)'))

            # address and phone are brittle, inconsistent
            p = Legislator(name=name, post_id=district_num, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email, None)
            p.image = photo_url
            yield p
def get_people(self):
    """Yield a Legislator for every profile page linked from the council page.

    The second line of each profile's info cell decides the post: a line
    ending in ``" Ward"`` is a ward councillor, ``"At Large"`` is a
    councillor for Thunder Bay, and anything else is the mayor.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    profile_urls = index_page.xpath('//a[contains(@title, "Profile")][1]/@href')
    for profile_url in profile_urls:
        profile = lxmlize(profile_url)
        info = profile.xpath('//table/tbody/tr/td[2]')[0]

        # Put a newline after every <br> so text_content() splits into lines.
        for br in info.xpath('*//br'):
            if br.tail:
                br.tail = '\n' + br.tail
            else:
                br.tail = '\n'

        lines = []
        for raw_line in info.text_content().split('\n'):
            if raw_line.strip():
                lines.append(raw_line.strip())
        text = '\n'.join(lines)  # joined copy of the lines; not read below

        name = lines[0].replace('Councillor ', '').replace('Mayor ', '')
        second_line = lines[1]
        if second_line.endswith(' Ward'):
            district = second_line.replace(' Ward', '')
            role = 'Councillor'
        elif second_line == 'At Large':
            district = 'Thunder Bay'
            role = 'Councillor'
        else:
            district = 'Thunder Bay'
            role = 'Mayor'
        # Remove any title words still left in the name.
        name = name.replace('Councillor', '').replace('At Large', '').replace('Mayor', '').strip()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = profile.xpath('//td[@valign="top"]/img/@src')[0]

        address_parts = info.xpath('./p/text()')[0:2]
        address = re.sub(r'\s{2,}', ' ', ', '.join(address_parts).strip())
        p.add_contact('address', address, 'legislature')

        # Each text node of the second paragraph reads "<label>:<value>".
        for entry in info.xpath('./p[2]/text()'):
            label, value = entry.split(':')
            value = value.replace('(1st)', '').replace('(2nd)', '').strip()
            if 'Fax' in label:
                p.add_contact('fax', value, 'legislature')
            elif 'Email' in label:
                # The email is read from the mailto link instead.
                break
            else:
                p.add_contact('voice', value, label)

        email = info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each remaining table cell.

    Table cells 4 through the second-to-last of the ``printArea`` table are
    treated as people; the first of them goes to ``self.scrape_mayor``.  A
    cell heading containing ``-`` is a ward councillor, otherwise a regional
    councillor for Newmarket.  Phone numbers are parsed from each person's
    own page.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    cells = index_page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(cells[0])

    for cell in cells[1:]:
        name = ' '.join(cell.xpath('string(.//strong/a[last()])').split())
        heading_parts = cell.xpath('string(.//strong)').split('-')
        if len(heading_parts) > 1:
            district = heading_parts[1]
            role = 'Councillor'
        else:
            district = 'Newmarket'
            role = 'Regional Councillor'

        url = cell.xpath('.//a/@href')[0]
        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = cell.xpath('.//img/@src')[0]

        profile = lxmlize(url)
        print_area = profile.xpath('//div[@id="printArea"]')[0]
        paragraphs = print_area.xpath('.//p[@class="heading"][2]/following-sibling::p')

        # The address paragraph is consumed (skipping one empty paragraph if
        # needed) but is not attached to the Legislator.
        address = paragraphs.pop(0).text_content().strip()
        if not address:
            address = paragraphs.pop(0).text_content().strip()
        if 'Ward' in paragraphs[0].text_content():
            paragraphs.pop(0)
        numbers = paragraphs.pop(0).text_content().split(':')

        email = profile.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)

        # After splitting on ":", each segment holds a number and ends with
        # the label of the next one, so a segment's label is the trailing
        # word of the segment before it.
        for i, contact in enumerate(numbers):
            if i == 0 or '@' in contact:
                # Segment 0 has no number; "@" marks the executive
                # assistant email.
                continue
            number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
            ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
            if ext:
                number = number + ext[0].replace('Ext. ', ' x')
            contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
            if 'Fax' in contact_type:
                p.add_contact('fax', number, 'legislature')
            elif 'Phone' in contact_type:
                p.add_contact('voice', number, 'legislature')
            else:
                p.add_contact(contact_type, number, contact_type)

        site = profile.xpath('.//a[contains(text(), "http://")]')
        if site:
            p.add_link(site[0].text_content(), None)
        yield p
def get_people(self):
    """Yield a Legislator for each ``<h1 class="title">`` naming a person.

    Headings without a comma are ignored.  The text is split into
    ``"<name>,<district>"``; a district containing ``Mayor`` produces the
    mayor of Beaconsfield, anything else a councillor whose post is the part
    of the district after the first ``-``.
    """
    page = lxmlize(COUNCIL_PAGE)
    for heading in page.xpath('//h1[@class="title"]'):
        heading_text = heading.text_content()
        if ',' not in heading_text:
            continue
        name, district = heading_text.split(',')
        name = name.strip()

        if 'Mayor' in district:
            p = Legislator(name=name, post_id='Beaconsfield', role='Maire')
            p.add_source(COUNCIL_PAGE)
            p.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
            # The mayor's phone and email script sit in sibling <div>s.
            phone = heading.xpath(
                './/parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
            phone = phone.split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            script = heading.xpath(
                './/parent::div/following-sibling::div/script')[0].text_content()
            p.add_contact('email', get_email(script), None)
            yield p
        else:
            district = district.split('-')[1].strip()
            p = Legislator(name=name, post_id=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
            # Councillors keep theirs in sibling <p>s; the phone is optional.
            phone_nodes = heading.xpath(
                './/parent::div/following-sibling::p[contains(text(), "514")]/text()')
            if phone_nodes:
                phone = phone_nodes[0].split(':')[1].strip().replace(' ', '-')
                p.add_contact('voice', phone, 'legislature')
            script = heading.xpath(
                './/parent::div/following-sibling::p/script')[0].text_content()
            p.add_contact('email', get_email(script), None)
            yield p
def get_people(self):
    """Yield a Legislator per councillor table row, then one for the mayor.

    The header row of the ``table_style`` table is skipped.  Each remaining
    row provides name, district, address, phone and fax; the photo comes from
    the councillor's own page.  The mayor is scraped from ``MAYOR_PAGE``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    rows = listing.xpath('//table[@class="table_style"]/tbody/tr')[1:]
    for row in rows:
        name = row.xpath('.//a')[0].text_content()
        district = 'District %s' % row.xpath('.//strong')[0].text_content()
        address = row.xpath('.//td')[3].text_content().replace("\r\n", ', ')

        # First text node of the fifth cell is the phone, second is the fax;
        # both read "<label>:<number>".
        phone_line = row.xpath('.//td[5]/p/text()')[0]
        phone = phone_line.split(':')[1].replace("(", '').replace(") ", '-')
        fax_line = row.xpath('.//td[5]/p/text()')[1]
        fax = fax_line.split(':')[1].replace("(", '').replace(") ", '-')

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')

        profile_url = row.xpath('.//a/@href')[0]
        p.add_source(profile_url)
        profile = lxmlize(profile_url)
        p.image = profile.xpath('//img[@class="image_left"]/@src')[0]
        yield p

    mayor_page = lxmlize(MAYOR_PAGE)
    about_heading = mayor_page.xpath('//strong[contains(text(), "About")]')[0]
    name = re.search('About Mayor (.+):', about_heading.text).group(1)
    photo_url = mayor_page.xpath('string(//span/img/@src)')
    # The paragraph right after the "Contact" heading holds address and phone.
    contact_block = mayor_page.xpath(
        '//strong[contains(text(), "Contact")]/ancestor::p/'
        'following-sibling::p[1]')[0]
    address = contact_block[0].text_content()
    phone = contact_block[2].text.split(':')[1]

    p = Legislator(name=name, post_id='Cape Breton', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    # email is protected through JS
    p.image = photo_url
    yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor title span.

    Every ``<span class="Title">`` on the council page describes one person.
    The first span is the mayor.  Later spans are expected to read
    ``"<name> - <ward ...>"`` (hyphen or en dash); spans that do not split
    into exactly two parts are skipped.
    """
    root = lxmlize(COUNCIL_PAGE)
    title_spans = root.xpath('//span[@class="Title"]')

    mayor_span = title_spans[0]
    mayor_text = ' '.join(mayor_span.xpath('.//text()'))
    # The mayor's name is everything before the first "(".
    mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
    mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
    mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

    m = Legislator(name=mayor_name, post_id='Charlottetown', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('email', mayor_email, None)
    m.image = mayor_photo
    yield m

    for span in root.xpath('//span[@class="Title"]')[1:]:
        span_text = ' '.join(span.xpath('.//text()'))
        header = span_text.replace(u'\u2013', '-').split('-')
        if len(header) != 2:
            continue
        name_part, ward_part = header

        # Strip the title and any parenthesised note, then collapse spaces.
        name = name_part.strip().replace('Councillor', '')
        name = re.sub(r'\(.+?\)', '', name)
        name = ' '.join(name.split())

        district_name = ward_part.strip()  # full label; not used below
        # The post id is the first two words of the ward label.
        district_id = ' '.join(ward_part.split()[:2])

        # needed a wacky xpath to deal with ward 8
        photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
        photo_url = urljoin(COUNCIL_PAGE, photo[0])
        email = span.xpath('string(following::a[1]/text())')

        p = Legislator(name=name, post_id=district_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
            p.add_contact('email', email, None)
        p.image = photo_url
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each remaining table cell.

    Table cells 4 through the second-to-last of the ``printArea`` table are
    treated as people; the first of them goes to ``self.scrape_mayor``.  A
    cell heading containing ``-`` is a ward councillor, otherwise a regional
    councillor for Newmarket.  Phone numbers are parsed from each person's
    own page.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    cells = index_page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(cells[0])

    for cell in cells[1:]:
        name = ' '.join(cell.xpath('string(.//strong/a[last()])').split())
        heading_parts = cell.xpath('string(.//strong)').split('-')
        if len(heading_parts) > 1:
            district = heading_parts[1]
            role = 'Councillor'
        else:
            district = 'Newmarket'
            role = 'Regional Councillor'

        url = cell.xpath('.//a/@href')[0]
        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = cell.xpath('.//img/@src')[0]

        profile = lxmlize(url)
        print_area = profile.xpath('//div[@id="printArea"]')[0]
        paragraphs = print_area.xpath('.//p[@class="heading"][2]/following-sibling::p')

        # The address paragraph is consumed (skipping one empty paragraph if
        # needed) but is not attached to the Legislator.
        address = paragraphs.pop(0).text_content().strip()
        if not address:
            address = paragraphs.pop(0).text_content().strip()
        if 'Ward' in paragraphs[0].text_content():
            paragraphs.pop(0)
        numbers = paragraphs.pop(0).text_content().split(':')

        email = profile.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)

        # After splitting on ":", each segment holds a number and ends with
        # the label of the next one, so a segment's label is the trailing
        # word of the segment before it.
        for i, contact in enumerate(numbers):
            if i == 0 or '@' in contact:
                # Segment 0 has no number; "@" marks the executive
                # assistant email.
                continue
            number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
            ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
            if ext:
                number = number + ext[0].replace('Ext. ', ' x')
            contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
            if 'Fax' in contact_type:
                p.add_contact('fax', number, 'legislature')
            elif 'Phone' in contact_type:
                p.add_contact('voice', number, 'legislature')
            else:
                p.add_contact(contact_type, number, contact_type)

        site = profile.xpath('.//a[contains(text(), "http://")]')
        if site:
            p.add_link(site[0].text_content(), None)
        yield p
def get_people(self):
    """Yield a Legislator for each ward councillor, then one for the mayor.

    Councillor links on the council page read ``"<name> - <district>"``;
    email, phone and photo come from each linked page.  The mayor's details
    are scraped from ``MAYOR_PAGE``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    ward_links = listing.xpath(
        '//span[@class="textimagetype"]//a[contains(text(), "- Ward")]')
    for ward_link in ward_links:
        name, district = ward_link.text.split(' - ')
        profile_url = ward_link.attrib['href']
        profile = lxmlize(profile_url)

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)

        email = profile.xpath('string(//a[contains(@href, "@")])')
        p.add_contact('email', email, None)

        # Use the segment after the first colon of the first text node that
        # mentions "Phone".
        phone_text = profile.xpath('string(//text()[contains(., "Phone")])')
        p.add_contact('voice', phone_text.split(':')[1], 'legislature')

        relative_src = profile.xpath('string((//span/img)[1]/@src)')
        p.image = urljoin(profile_url, relative_src)
        yield p

    mayor_page = lxmlize(MAYOR_PAGE)
    # The name is taken as the first two words of the "is married to" text.
    bio_text = mayor_page.xpath('//p[contains(text(), "is married to")]/text()')[0]
    name = ' '.join(bio_text.split()[:2])
    office_lines = mayor_page.xpath('//p[contains(text(), "Mayor\'s Office")]/text()')
    address = ' '.join(office_lines[1:])
    # All but the last text node of the "Phone:" paragraph: phone, then fax.
    phone, fax = mayor_page.xpath('//p[contains(text(), "Phone:")]/text()')[:-1]
    phone = phone.strip().replace('(', '').replace(') ', '-')
    fax = fax.strip().replace('(', '').replace(') ', '-').split(':')[1]
    email = mayor_page.xpath('//a[contains(@href, "mailto:")]/text()')[0]

    p = Legislator(name=name, post_id='Windsor', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.image = mayor_page.xpath(
        '//div[@class="sectioning"]//img[contains(@title, "Mayor")]/@src')[0]
    yield p
def get_people(self):
    """Yield the mayor, then every councillor listed in the council table.

    The first table row holds the mayor.  Rows from the third up to (but not
    including) the last are consumed in pairs: a ward-name row followed by a
    row whose cells each describe one councillor.
    """
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="litcontentDiv"]//tr')
    mayor_row = rows[0]
    ward_rows = rows[2:-1]

    mayor_photo = urljoin(COUNCIL_PAGE, mayor_row.xpath('string(.//img/@src)'))
    mayor_cell = mayor_row.xpath('./td')[1]
    mayor_name = mayor_cell.xpath('string(.//strong)')
    mayor_href = mayor_cell.xpath('string(.//a[contains(., "@")]/@href)')
    # Drop an optional leading "mailto:" from the href.
    mayor_email = re.match('(?:mailto:)?(.*)', mayor_href).group(1)

    p = Legislator(name=mayor_name, post_id='Sault Ste. Marie', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_contact('email', mayor_email, None)
    p.image = mayor_photo
    yield p

    # Rows alternate between a ward name and that ward's councillors, so
    # pulling twice from one iterator pairs them up.
    row_iter = iter(ward_rows)
    for ward_row, data_row in zip(row_iter, row_iter):
        district = ward_row.xpath('string(.//text()[contains(., "Ward")])')
        district_num = district_name_using_number(district)
        for cell in data_row.xpath('./td'):
            name = cell.xpath('string(.//strong)')
            if not name:
                # Bad markup: fall back to the text after the <strong> tag.
                name = cell.xpath('string(.//strong/following-sibling::'
                                  'text())')
            href = cell.xpath('string(.//a[contains(., "@")]/@href)')
            email = re.match('(?:mailto:)?(.*)', href).group(1)
            photo_url = urljoin(COUNCIL_PAGE, cell.xpath('string(.//img/@src)'))

            # address and phone are brittle, inconsistent
            p = Legislator(name=name, post_id=district_num, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email, None)
            p.image = photo_url
            yield p
def get_people(self):
    """Yield the mayor, then a Legislator per ``two_third last`` block.

    The first block is passed to ``self.scrape_mayor``.  For every other
    block the name, district, phone, email and website are read from the
    listing; photo, address and blog/Facebook/Twitter links come from the
    councillor's own page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    blocks = listing.xpath('//*[@class="two_third last"]')
    for position, block in enumerate(blocks):
        if position == 0:
            yield self.scrape_mayor(block)
            continue

        name = block.xpath('.//a')[0].text_content().replace(
            'Councillor', '').replace('Mayor', '')
        # Non-blank text nodes: index 2 is the district, index 3 the phone.
        info = block.xpath('.//text()[normalize-space()]')
        district = info[2]
        url = block.xpath('.//a')[0].attrib['href']

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', info[3].replace('extension', 'x'), 'legislature')

        email_links = block.xpath('.//a[contains(@href,"mailto:")]')
        if email_links:
            p.add_contact('email', email_links[0].text_content(), None)

        site_links = block.xpath('.//a[contains(text(),"Website")]')
        if site_links:
            p.add_link(site_links[0].attrib['href'], None)

        profile = lxmlize(url)
        p.image = profile.xpath('//header/img/@src')[0]

        entry_text = profile.xpath('//div[@class="entry-content"]')[0].text_content()
        address = re.findall(r'Address: (.*)Phone', entry_text)
        if address:
            p.add_contact('address', address[0], 'legislature')

        blog_links = profile.xpath('//a[contains(text(),"Blog")]')
        if blog_links:
            p.add_link(blog_links[0].attrib['href'], None)

        # Facebook first, then Twitter: first matching link in the entry.
        for network in ('facebook', 'twitter'):
            social_links = profile.xpath(
                '//div[@class="entry-content"]//a[contains(@href, "%s")]' % network)
            if social_links:
                p.add_link(social_links[0].attrib['href'], None)
        yield p
def mayor_data(url):
    """Build and return the mayor's Legislator from the page at *url*.

    The name is the page's ``<h1>`` text with its first six characters
    removed; the photo is whatever ``photo_url`` returns for the page.
    """
    mayor_page = lxmlize(url)
    heading = mayor_page.xpath('string(//h1)')
    # Skip the leading word "Mayor" and the space after it.
    mayor_name = heading[len('Mayor '):]

    mayor = Legislator(name=mayor_name, post_id='Waterloo', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(url)
    mayor.image = photo_url(mayor_page)
    return mayor
def get_people(self):
    """Yield a Legislator per councillor table row, then one for the mayor.

    The header row of the ``table_style`` table is skipped.  Each remaining
    row provides name, district, address, phone and fax; the photo comes from
    the councillor's own page.  The mayor is scraped from ``MAYOR_PAGE``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    rows = listing.xpath('//table[@class="table_style"]/tbody/tr')[1:]
    for row in rows:
        name = row.xpath('.//a')[0].text_content()
        district = 'District %s' % row.xpath('.//strong')[0].text_content()
        address = row.xpath('.//td')[3].text_content().replace("\r\n", ', ')

        # First text node of the fifth cell is the phone, second is the fax;
        # both read "<label>:<number>".
        phone_line = row.xpath('.//td[5]/p/text()')[0]
        phone = phone_line.split(':')[1].replace("(", '').replace(") ", '-')
        fax_line = row.xpath('.//td[5]/p/text()')[1]
        fax = fax_line.split(':')[1].replace("(", '').replace(") ", '-')

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')

        profile_url = row.xpath('.//a/@href')[0]
        p.add_source(profile_url)
        profile = lxmlize(profile_url)
        p.image = profile.xpath('//img[@class="image_left"]/@src')[0]
        yield p

    mayor_page = lxmlize(MAYOR_PAGE)
    about_heading = mayor_page.xpath('//strong[contains(text(), "About")]')[0]
    name = re.search('About Mayor (.+):', about_heading.text).group(1)
    photo_url = mayor_page.xpath('string(//span/img/@src)')
    # The paragraph right after the "Contact" heading holds address and phone.
    contact_block = mayor_page.xpath(
        '//strong[contains(text(), "Contact")]/ancestor::p/'
        'following-sibling::p[1]')[0]
    address = contact_block[0].text_content()
    phone = contact_block[2].text.split(':')[1]

    p = Legislator(name=name, post_id='Cape Breton', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    # email is protected through JS
    p.image = photo_url
    yield p
def councillor_data(url, name, ward):
    """Yield a single councillor Legislator built from the page at *url*.

    This is a generator: it produces exactly one Legislator carrying the
    given *name* and *ward*, plus the phone number and photo scraped from
    the page.
    """
    profile = lxmlize(url)
    # sadly, email is a form on a separate page
    phone_label = profile.xpath('string(//strong[contains(., "Phone")])')
    phone = phone_label.split(':')[1]
    relative_src = profile.xpath('string(//div[@id="contentcontainer"]//img/@src)')

    member = Legislator(name=name, post_id=ward, role='Councillor')
    member.add_source(COUNCIL_PAGE)
    member.add_source(url)
    member.add_contact('voice', phone, 'legislature')
    member.image = urljoin(url, relative_src)
    yield member
def mayor_data(node):
    """Build and return the mayor's Legislator from *node*.

    The name is the ``<strong>`` text with its first six characters removed;
    phone, email and photo are read from the node's second paragraph, first
    link and first image respectively.
    """
    heading = node.xpath('string(.//strong)')
    mayor_name = heading[6:]
    phone = node.xpath('string(.//p[2]/text()[1])')
    email = node.xpath('string((.//a)[1])')
    photo_url = node.xpath('string(.//img/@src)')

    mayor = Legislator(name=mayor_name, post_id='Hamilton', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_contact('email', email, None)
    mayor.add_contact('voice', phone, 'legislature')
    mayor.image = photo_url
    return mayor
def councillor_data(html):
    """Build and return a councillor Legislator from the element *html*.

    The ``wardInfo`` div must contain exactly two text nodes, district then
    phone; any other count makes the unpacking raise ``ValueError``.
    """
    name = html.xpath('string(./div[@class="councillorInfo"]/a/text()[2])')
    email = html.xpath('string(./div[@class="emailInfo"])')
    ward_text_nodes = html.xpath('./div[@class="wardInfo"]/text()')
    district, phone = ward_text_nodes
    # The first src attribute found anywhere under the element is the photo.
    photo = html.xpath('string((.//@src)[1])')

    member = Legislator(name=name, post_id=district, role='Councillor')
    member.add_source(COUNCIL_PAGE)
    member.add_contact('voice', phone, 'legislature')
    member.add_contact('email', email, None)
    member.image = photo
    return member
def councillor_data(url, name, ward):
    """Yield a single councillor Legislator built from the page at *url*.

    This is a generator: it produces exactly one Legislator carrying the
    given *name* and *ward*, plus the phone number and photo scraped from
    the page.
    """
    profile = lxmlize(url)
    # sadly, email is a form on a separate page
    phone_label = profile.xpath('string(//strong[contains(., "Phone")])')
    phone = phone_label.split(':')[1]
    relative_src = profile.xpath('string(//div[@id="contentcontainer"]//img/@src)')

    member = Legislator(name=name, post_id=ward, role='Councillor')
    member.add_source(COUNCIL_PAGE)
    member.add_source(url)
    member.add_contact('voice', phone, 'legislature')
    member.image = urljoin(url, relative_src)
    yield member