def _scrape_people(self):
    url = 'http://www.cabq.gov/council/councilors'
    page = self.lxmlize(url)
    names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:]
    # Elements come in groups of three: name cell, info cell, spacer.
    it = iter(names)
    for entry in zip(it, it, it):
        name, info, _ = entry
        image_small = name.xpath(".//img")[0].attrib['src']
        name = name.text_content()
        infopage, email, policy_analyst = info.xpath(".//a")
        phone = info.xpath(".//b")[-1].tail.strip()
        district = infopage.text_content()
        homepage = self.lxmlize(infopage.attrib['href'])
        photo = homepage.xpath(
            "//div[@class='featureContent']//img")[0].attrib['src']
        bio = "\n".join(x.text_content() for x in homepage.xpath(
            "//div[@class='featureContent']//div[@class='stx']/p"))
        p = Legislator(name=name, post_id=district, image=photo,
                       biography=bio)
        p.add_source(url)
        p.add_source(infopage.attrib['href'])
        yield p
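# These scrapers all rely on a self.lxmlize() helper that is not shown in
# this file. A minimal sketch of the conventional implementation, assuming
# the scraper base class exposes urlopen() and that lxml is installed; the
# helper in the original codebase may differ:
import lxml.html

def lxmlize(self, url):
    """Fetch `url`, parse the HTML, and make relative links absolute."""
    html = self.urlopen(url)
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)
    return page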
def test_legislator_related_party():
    l = Legislator('John Adams', district='1', party='Democratic-Republican')
    l.pre_save('jurisdiction-id')

    # a party membership
    assert len(l._related) == 2
    assert l._related[1].person_id == l._id
    assert get_pseudo_id(l._related[1].organization_id) == {
        'classification': 'party',
        'name': 'Democratic-Republican',
    }
    assert l._related[1].role == 'member'
def test_legislator_related_district():
    l = Legislator('John Adams', district='1')
    l.pre_save('jurisdiction-id')
    assert len(l._related) == 1
    assert l._related[0].person_id == l._id
    assert get_pseudo_id(l._related[0].organization_id) == {
        'chamber': '',
        'classification': 'legislature',
    }
    assert get_pseudo_id(l._related[0].post_id) == {"label": "1"}
    assert l._related[0].role == 'member'
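# The tests above assume a get_pseudo_id() helper that reverses the
# "pseudo id" encoding used for not-yet-resolved relations. A minimal
# sketch, assuming pseudo ids are JSON payloads prefixed with '~' (the
# pupa convention); the real helper lives in the library itself:
import json

def get_pseudo_id(pid):
    """Decode a '~'-prefixed JSON pseudo id back into a dict."""
    if not pid.startswith('~'):
        raise ValueError('not a pseudo id: %s' % pid)
    return json.loads(pid[1:])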
def scrape_homepage(self, folk): url = folk.attrib["href"] page = self.lxmlize(url) image = page.xpath("//img[contains(@src, 'uploadedImages/City_Council/Members/')]")[0].attrib["src"] name = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3") name, = name bio = "\n\n".join([x.text_content() for x in page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")]) leg = Legislator(name=name.text, district="member", biography=bio, image=image) leg.add_source(url) return leg
def get_people(self):
    html = self.urlopen(self.url)
    doc = lxml.html.fromstring(html)
    title_xpath = '//div[contains(@class, "biotitle")]'
    name_xpath = '//div[contains(@class, "bioname")]'
    for title, name in zip(doc.xpath(title_xpath), doc.xpath(name_xpath)):
        name = name.text_content().strip()
        title = title.text_content().strip()
        p = Legislator(name=name, post_id=title)
        p.add_source(self.url)
        yield p
def bos_scrape_people(self):
    page = self.lxmlize(MEMBER_LIST)
    people = page.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")
    for person in people:
        image, name = [self.get_one(person, x) for x in [
            ".//img",
            ".//a[contains(@href, 'councillors') and (text()!='')]"
        ]]
        role = person.xpath(".//br")[0].tail.strip()
        # Fallback image if we don't get one from the homepage.
        image = image.attrib['src']
        homepage = name.attrib['href']
        name = clean_name(name.text)
        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']
        p = Legislator(name=name, post_id=role, image=image,
                       biography=info['bio'])
        p.add_link(homepage, 'homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_LIST)
        yield p
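# clean_name() is an external helper not shown in this file. A minimal,
# hypothetical sketch, assuming it only normalizes whitespace and strips a
# leading honorific; the real helper may do more:
def clean_name(name):
    """Collapse whitespace and drop a leading 'Councillor'-style title."""
    name = " ".join(name.split())
    for title in ("Councillor", "Councilor"):
        if name.startswith(title):
            name = name[len(title):].strip()
    return name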
def cleveland_scrape_people(self):
    listing = "http://www.clevelandcitycouncil.org/council-members/"
    page = self.lxmlize(listing)
    table = page.xpath("//div[@class='standard-content column']//table")[0]
    for person in table.xpath(".//td[@align='center']"):
        strong = person.xpath(".//strong")[0]
        who = strong.text.strip()
        role = strong.xpath("./br")[0].tail.strip()
        img = person.xpath(".//img")[0].attrib['src']
        info = INFOSLUG.match(role).groupdict()

        scraped_info = {}
        page = person.xpath(".//a")
        if page != []:
            page = page[0].attrib['href']
            scraped_info = self.scrape_page(page)

        kwargs = {}
        biography = scraped_info.get('bio', None)
        if biography:
            kwargs['biography'] = biography

        p = Legislator(name=who, district=info['district'],
                       gender=info['gender'], image=img, **kwargs)
        p.add_source(listing)

        valid_titles = ["Chair", "Vice Chair"]
        for what in scraped_info.get('committees', []):
            what = what.strip()
            if what == "":
                continue
            role = "member"
            # Committee strings like "Finance - Chair" carry a title.
            if "-" in what:
                c, title = (x.strip() for x in what.rsplit("-", 1))
                if title in valid_titles:
                    what = c
                    role = title
            p.add_committee_membership(what, role=role)
        yield p
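# INFOSLUG is a module-level regex not shown here. Given that the code
# above reads info['district'] and info['gender'] out of its groupdict(),
# a hypothetical pattern compatible with role strings like
# "Ward 5 Councilman" might look like this (the real pattern may differ):
import re

INFOSLUG = re.compile(r"Ward (?P<district>\d+),? Council(?P<gender>man|woman)")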
def get_people(self):
    # committee
    tech = Organization('Technology', classification='committee')
    tech.add_post('Chairman', 'chairman')
    yield tech

    # subcommittee
    ecom = Organization('Subcommittee on E-Commerce', parent=tech,
                        classification='committee')
    yield ecom

    p = Legislator('Paul Tagliamonte', district='6', chamber='upper',
                   party='Independent')
    p.add_committee_membership('Finance')
    p.add_membership(tech, role='chairman')
    yield p
def scrape_ward(self, el):
    url = el.attrib['href']
    page = self.lxmlize(url)
    name = page.xpath("//div[@id='content-content']/h3")[0].text_content()
    badthings = ["Alderman"]
    for thing in badthings:
        if name.startswith(thing):
            name = name[len(thing):].strip()
    district = page.xpath("//h1[@class='page-heading']/text()")[0]
    leg = Legislator(name=name, post_id=district)
    leg.add_source(url)
    type_types = {
        "City Hall Office:": ("address", "City Hall Office"),
        "City Hall Phone:": ("phone", "City Hall Phone"),
        "Phone:": ("phone", "Personal Phone"),
        "Office:": ("address", "Personal Office"),
        "Fax:": ("fax", "Fax"),
        "Fax": ("fax", "Fax"),
    }
    for row in page.xpath("//table//tr"):
        type_, val = (x.text_content().strip() for x in row.xpath("./td"))
        if val == "":
            continue
        types = [type_]
        vals = [val]
        if "\n" in type_:
            if "\n" in val:
                types = type_.split("\n")
                vals = val.split("\n")
            else:
                continue
        for type_ in types:
            for val in vals:
                ctype, note = type_types[type_]
                leg.add_contact(ctype, val, note)
    return leg
def scrape(self):
    page = self.lxmlize(MEMBER_LIST)
    for row in page.xpath("//table[@frame='void']/tbody/tr")[1:]:
        role, whos, expire = row.xpath("./*")
        people = zip([x.text_content() for x in whos.xpath(".//font")],
                     [x.text_content() for x in expire.xpath(".//font")])
        thing = role.text_content()
        comm = Committee(name=thing)
        url = role.xpath(".//a")[0].attrib['href']
        comm.add_link(url=url, note='homepage')
        for person, expire in people:
            if "TBA" in person:
                continue
            info = {}
            try:
                info = re.match(r"(?P<name>.*), (?P<addr>\d+\w* .*)",
                                person).groupdict()
            except AttributeError:
                info = re.match(r"(?P<name>.*) (?P<addr>\d+\w* .*)",
                                person).groupdict()
            addr = info['addr']
            roles = {
                "Vice Chair": "Vice Chair",
                "Chair": "Chair",
                "CHAIR": "Chair",
                "Appt": "member",
            }
            position = "member"
            if "Resigned" in addr:
                continue
            for role in roles:
                if role in addr:
                    addr, chair = [x.strip() for x in addr.rsplit(role, 1)]
                    position = roles[role]
            addr = clean_address(addr)
            leg = Legislator(name=info['name'], district=position)
            leg.add_contact_detail(type="address", value=addr, note="Address")
            leg.add_source(MEMBER_LIST)
            # Attach the committee membership before yielding so it is
            # saved along with the legislator.
            leg.add_membership(comm)
            yield leg
        comm.add_source(MEMBER_LIST)
        yield comm
def scrape_homepage(self, folk):
    url = folk.attrib['href']
    page = self.lxmlize(url)
    image = page.xpath(
        "//img[contains(@src, 'uploadedImages/City_Council/Members/')]"
    )[0].attrib['src']
    name = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3")
    name, = name
    bio = "\n\n".join([
        x.text_content() for x in page.xpath(
            "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")
    ])
    leg = Legislator(name=name.text, post_id='member',
                     biography=bio, image=image)
    leg.add_source(url)
    return leg
def _scrape_people(self):
    url = 'http://www.cabq.gov/council/councilors'
    page = self.lxmlize(url)
    names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:]
    it = iter(names)
    for entry in zip(it, it, it):
        name, info, _ = entry
        image_small = name.xpath(".//img")[0].attrib['src']
        name = name.text_content()
        infopage, email, policy_analyst = info.xpath(".//a")
        phone = info.xpath(".//b")[-1].tail.strip()
        district = infopage.text_content()
        homepage = self.lxmlize(infopage.attrib['href'])
        photo = homepage.xpath(
            "//div[@class='featureContent']//img")[0].attrib['src']
        bio = "\n".join(x.text_content() for x in homepage.xpath(
            "//div[@class='featureContent']//div[@class='stx']/p"))
        p = Legislator(name=name, district=district, image=photo,
                       biography=bio)
        p.add_source(url)
        p.add_source(infopage.attrib['href'])
        yield p
def get_people(self): people = [ {"name": "Mckenzie A. Cannon", "district": "10a",}, {"name": "Yandel V. Watkins", "district": "Second Fnord and Norfolk",}, {"name": "Adrien A. Coffey", "district": "A",}, {"district": "10c", "name": "Natasha Moon",}, {"district": "Berkshire, Hampshire, Franklin and Hampden", "name": "Ramon Harmon",}, {"district": "5", "name": "Sam Sellers",}, {"district": "6", "name": "Estrella Hahn",}, {"district": "B", "name": "Teagan Rojas",}, {"district": "C", "name": "Barrett Adams",}, {"district": "D", "name": "Kayla Shelton",}, {"district": "E", "name": "Kohen Dudley",}, {"district": "F", "name": "Cayden Norman",}, {"district": "ZZ", "name": "Shayla Fritz",}, {"district": "Ward 2", "name": "Gunnar Luna",}, {"district": "Green", "name": "Regina Cruz",}, {"district": "Blue", "name": "Makenzie Keller",}, {"district": "Red", "name": "Eliana Meyer",}, {"district": "Yellow", "name": "Taylor Parrish",}, {"district": "Silver", "name": "Callie Craig",}, ] for person in people: l = Legislator(**person) l.add_source("http://example.com") dslug = ( person['district'].lower().replace(" ", "-").replace(",", "")) l.add_contact_detail( type='email', value="*****@*****.**" % (dslug), note='office email' ) yield l
def nyc_scrape_people(self):
    page = self.lxmlize(MEMBER_PAGE)
    for entry in page.xpath("//table[@id='members_table']//tr"):
        entries = entry.xpath(".//td")
        if entries == []:
            continue
        name, district, borough, party = entries
        name = name.xpath(".//a")[0]
        homepage = name.attrib['href']
        name, district, borough, party = [
            x.text for x in [name, district, borough, party]
        ]
        info = self.scrape_homepage(homepage)
        p = Legislator(name=name,
                       post_id=district,
                       # borough=borough,
                       party=party.strip() or "other")
        p.add_link(homepage, 'homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_PAGE)
        yield p
def cleveland_scrape_people(self):
    listing = "http://www.clevelandcitycouncil.org/council-members/"
    page = self.lxmlize(listing)
    table = page.xpath("//div[@class='standard-content column']//table")[0]
    for person in table.xpath(".//td[@align='center']"):
        strong = person.xpath(".//strong")[0]
        who = strong.text.strip()
        role = strong.xpath("./br")[0].tail.strip()
        img = person.xpath(".//img")[0].attrib['src']
        info = INFOSLUG.match(role).groupdict()

        scraped_info = {}
        page = person.xpath(".//a")
        if page != []:
            page = page[0].attrib['href']
            scraped_info = self.scrape_page(page)

        kwargs = {}
        biography = scraped_info.get('bio', None)
        if biography:
            kwargs['biography'] = biography

        p = Legislator(name=who, post_id=info['district'],
                       gender=info['gender'], image=img, **kwargs)
        p.add_source(listing)

        valid_titles = ["Chair", "Vice Chair"]
        for what in scraped_info.get('committees', []):
            what = what.strip()
            if what == "":
                continue
            role = "member"
            if "-" in what:
                c, title = (x.strip() for x in what.rsplit("-", 1))
                if title in valid_titles:
                    what = c
                    role = title
            p.add_committee_membership(what, role=role)
        yield p
def scrape_mayor(self):
    url = 'http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=e53332d0b6d1e310VgnVCM10000071d60f89RCRD&vgnextfmt=default'
    page = self.lxmlize(url)
    name = page.xpath("//div[@class='detail']//h1/text()")[0]
    name = name.replace("Toronto Mayor", "").strip()
    p = Legislator(name, "Toronto")
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    url = page.xpath(
        '//a[contains(text(),"Contact the Mayor")]')[0].attrib['href']
    p.add_source(url)
    page = self.lxmlize(url)
    info = page.xpath('//div[@class="detail"]')[0]
    address = ', '.join(info.xpath('.//p/text()')[0:6]).replace(",,", ",")
    phone = info.xpath('.//p[3]/text()')[0]
    p.add_contact('address', address, 'Mailing')
    p.add_contact('phone', phone, '')
    return p
def scrape_councilor(self, url):
    page = self.lxmlize(url)
    info = page.xpath("//div[@class='main']")[0]
    name = info.xpath("//h3")[1].text_content().replace(
        'Councillor', '').strip()
    district = info.xpath("//p")[0].text_content()
    p = Legislator(name=name, district=district)
    info = info.xpath("//div[@class='last']")[0]

    # add links
    p.add_source(url)
    p.add_source(COUNCIL_PAGE)
    if "website:" in info.text_content():
        p.add_link(info.xpath('.//a')[1].attrib['href'], 'homepage')
    if "Facebook" in info.text_content():
        p.add_link(info.xpath(
            '//a[contains(@href, "facebook.com")]')[0].attrib['href'],
            'facebook')
    if "Twitter" in info.text_content():
        p.add_link(info.xpath(
            '//a[contains(@href,"twitter.com")]')[0].attrib['href'],
            'twitter')

    # add contact info
    p.add_contact('email', info.xpath('.//a')[0].text_content(), '')
    # //*[@id="content"]/div/div[1]/div[2]/p[1]
    contacts = info.xpath('//div/p[text()[contains(.,"Phone:")]]')
    for contact in contacts:
        note = contact.xpath('.//strong')[0].text_content()
        contact = contact.xpath('br/following-sibling::node()')
        if len(contact) > 8:
            continue
        if len(contact) >= 4:
            address = (contact[0] + ", " + contact[2]).strip()
            p.add_contact('address', address, note)
            # Guard the indexes so short node lists don't raise IndexError.
            if len(contact) > 4 and "Phone: " in contact[4]:
                phone = contact[4].replace("Phone: ", '').strip()
                p.add_contact('phone', phone, note)
            if len(contact) > 6 and "Fax:" in contact[6]:
                fax = contact[6].replace("Fax: ", '').strip()
                p.add_contact('fax', fax, note)
        else:
            phone = contact[0].strip()
            p.add_contact('phone', phone, note)
            fax = contact[2].strip()
            p.add_contact('fax', fax, note)