def get_people(self):
    """Yield the Temecula City Council, then one Person per listing row.

    The council Organization is yielded first.  Each row matched by
    ``//table[2]//tr`` on the listing page, except the first, is then
    turned into a Person holding a council membership, a work email
    contact detail, and two sources (the listing page and the link
    taken from the row).
    """
    urls = Urls(dict(list=legislators_url), self)
    list_url = urls.list.url

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(list_url)
    yield council

    rows = urls.list.xpath('//table[2]//tr')
    # The first matched row is skipped (presumably a header row).
    for row in rows[1:]:
        # Exactly two text nodes are expected here: name, then role.
        # Any other count makes the unpacking raise ValueError.
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # Exactly two hrefs are expected per row: the email link
        # followed by the member's detail page.
        mail_href, detail_url = row.xpath('td//a/@href')
        # Drop the first seven characters of the href (presumably the
        # "mailto:" prefix -- confirm against the page markup).
        membership.contact_details.append(
            {'type': 'email', 'value': mail_href[7:], 'note': 'work'})

        person.add_source(list_url)
        person.add_source(detail_url)
        yield person
def get_people(self):
    """Yield a committee, its subcommittee, and the committee's chairman.

    Three objects are produced, in this order: the 'Technology'
    Organization (with a 'Chairman' post), the 'Subcommittee on
    E-Commerce' Organization whose parent is that committee, and a
    Person who is given the 'chairman' role on the committee.
    """
    source_url = 'https://example.com'

    # Parent committee, carrying a single chairman post.
    committee = Organization('Technology')
    committee.add_post('Chairman', 'chairman')
    committee.add_source(source_url)
    yield committee

    # Subcommittee attached to the committee above.
    subcommittee = Organization(
        'Subcommittee on E-Commerce', parent=committee)
    subcommittee.add_source(source_url)
    yield subcommittee

    # The chairman is a member of the parent committee only.
    chairman = Person('Paul Tagliamonte', district='6', chamber='upper')
    chairman.add_membership(committee, role='chairman')
    chairman.add_source(source_url)
    yield chairman
def get_people(self):
    """Scrape the Temecula City Council listing page.

    Yields the council Organization first, followed by a Person for
    every row matched by ``//table[2]//tr`` except the first one.  Each
    Person gets a membership on the council, a work email contact
    detail on that membership, and the listing and detail URLs as
    sources.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    # Skip the first matched row (presumably the table header).
    member_rows = urls.list.xpath('//table[2]//tr')[1:]
    for member_row in member_rows:
        # Attributes read straight from the row; the text lookup must
        # return exactly two strings (name, role) or unpacking fails.
        name, role = member_row.xpath('td/p[1]//font/text()')
        photo = member_row.xpath('td/img/@src').pop()

        legislator = Person(name, image=photo)
        seat = legislator.add_membership(council, role=role)

        # The row's links must be exactly [email href, detail href].
        email_href, detail_url = member_row.xpath('td//a/@href')
        # Slice off the leading seven characters of the email href
        # (presumably "mailto:" -- verify against the live page).
        address = email_href[7:]
        work_email = dict(type='email', value=address, note='work')
        seat.contact_details.append(work_email)

        legislator.add_source(urls.list.url)
        legislator.add_source(detail_url)
        yield legislator
def get_people(self):
    """Yield the Boise City Council, then a Person per member page.

    Member page links are read from the listing page; the first link is
    skipped.  For every remaining link the detail page is registered on
    ``urls`` as ``detail`` and scraped for an image, a heading (from
    which role and name are derived), and a mailto link.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Boise City Council')
    council.add_source(legislators_url)
    yield council

    member_links = urls.list.xpath('//div[@id="content"]/div/a/@href')

    # Skip the mayor (first link): his page has no name or email.
    for member_link in member_links[1:]:
        urls.add(detail=member_link)
        detail_page = urls.detail

        image = detail_page.xpath(
            '//div[@id="content"]/p/img/@src').pop()

        # Remove 'Council ' from the heading; the text before the first
        # remaining space is the role and the rest is the name.
        heading = detail_page.xpath('//h1/text()').pop()
        role, _, name = heading.replace('Council ', '').partition(' ')

        person = Person(name, image=image)

        membership = person.add_membership(council, role=role)
        membership.add_source(detail_page.url)

        # Take the last mailto href and drop its first seven characters
        # (the length of "mailto:").
        mailto_href = detail_page.xpath(
            '//a[contains(@href, "mailto")]/@href').pop()
        membership.contact_details.append(
            {'type': 'email', 'value': mailto_href[7:], 'note': 'work'})

        person.add_source(urls.list.url)
        person.add_source(detail_page.url)
        yield person
def get_people(self):
    """Yield the Denver City Council, then one Person per council member.

    The listing page is scanned for links whose href contains
    "councildistrict"; these provide image URLs, names, detail-page
    URLs and post labels, which are zipped together positionally.  The
    council Organization is yielded once all posts have been added to
    it.  Each Person then gets a membership for its post plus any work
    phone and email found in the detail page's contact text.

    A missing phone number or email address no longer raises
    AttributeError; the corresponding contact detail is simply omitted.
    """
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Denver City Council')
    council.add_source(legislators_url)

    # Get image urls, names, detail urls, and districts.
    image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
    image_urls = urls.list.xpath(image_xpath)

    name_xpath = '//a[contains(@href, "councildistrict")]'
    # The last matching anchor is dropped and empty link texts (e.g.
    # image-only anchors) are filtered out.
    names = [a.text_content() for a in urls.list.xpath(name_xpath)][:-1]
    names = [name for name in names if name]

    person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
    person_urls = urls.list.xpath(person_urls_xpath)

    post_ids = []
    xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
    for td in urls.list.xpath(xpath):
        text = td.text_content()
        # Raw string: '\d' in a plain literal is an invalid escape.
        m = re.search(r'Council District \d+', text)
        if m:
            post_ids.append(m.group())
        elif 'Council At-Large' in text:
            post_ids.append('Council At-Large')

    # NOTE(review): a label that occurs more than once (e.g. several
    # at-large seats) is added as a post once per occurrence -- confirm
    # that this is intended.
    for post_id in post_ids:
        council.add_post(post_id, post_id)
    yield council

    # Loop-invariant patterns, defined once.
    phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
    # The previous pattern was the obfuscation placeholder
    # "[email protected]" and could never match a real address.  This
    # generic pattern is a best reconstruction.
    # NOTE(review): text_content() may glue adjacent text onto the
    # address; tighten to the council's domain if that is known.
    email_regex = r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+'

    # zip() stops at the shortest of the four sequences.
    data = zip(image_urls, names, person_urls, post_ids)
    for image_url, name, person_url, post_id in data:

        # Create legislator.
        person = Person(name, image=image_url)

        # Add sources.
        urls.add(detail=person_url)
        person.add_source(urls.list.url, note='list')
        person.add_source(urls.detail.url, note='detail')

        # Add membership on council.
        memb = person.add_membership(council, post_id=post_id.strip())
        memb.add_source(urls.detail.url)

        # Contact text lives in one of two containers; fall back to the
        # second when the first is blank.
        xpath = '//div[@id="dnn_column3"]'
        contact_text = urls.detail.xpath(xpath)[0].text_content()
        if not contact_text.strip():
            xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

        # Add phone number, when one is present.
        phone_match = re.search(phone_regex, contact_text)
        if phone_match:
            memb.contact_details.append(
                dict(type='phone', value=phone_match.group(), note='work'))

        # Add email address, when one is present.
        email_match = re.search(email_regex, contact_text)
        if email_match:
            memb.contact_details.append(
                dict(type='email', value=email_match.group(), note='work'))

        yield person