def bos_scrape_people(self):
    """Yield a Legislator for every councillor on the Boston member list."""
    listing = self.lxmlize(MEMBER_LIST)
    cells = listing.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")
    for cell in cells:
        portrait = self.get_one(cell, ".//img")
        anchor = self.get_one(
            cell, ".//a[contains(@href, 'councillors') and (text()!='')]")
        role = cell.xpath(".//br")[0].tail.strip()
        # The listing-page photo is only a fallback; the member's own
        # homepage image wins when available.
        image = portrait.attrib['src']
        homepage = anchor.attrib['href']
        name = clean_name(anchor.text)
        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']
        legislator = Legislator(name=name,
                                post_id=role,
                                image=image,
                                biography=info['bio'])
        legislator.add_link(homepage, 'homepage')
        legislator.add_source(homepage)
        legislator.add_source(MEMBER_LIST)
        yield legislator
def _scrape_people(self):
    """Yield a Legislator for each Albuquerque city councilor.

    Councilors occupy three consecutive sibling nodes in the listing
    (name block, info block, separator); photo and biography come from
    each councilor's own homepage.
    """
    url = 'http://www.cabq.gov/council/councilors'
    page = self.lxmlize(url)
    names = page.xpath("//div[@id='parent-fieldname-text']/*")[3:]
    it = iter(names)
    # zip(it, it, it) walks the flat node list in strides of three.
    for name, info, _ in zip(it, it, it):
        name = name.text_content()
        # Unpacking enforces that exactly three links are present.
        infopage, email, policy_analyst = info.xpath(".//a")
        phone = info.xpath(".//b")[-1].tail.strip()
        district = infopage.text_content()
        homepage = self.lxmlize(infopage.attrib['href'])
        photo = homepage.xpath(
            "//div[@class='featureContent']//img")[0].attrib['src']
        bio = "\n".join(x.text_content() for x in homepage.xpath(
            "//div[@class='featureContent']//div[@class='stx']/p"))
        p = Legislator(name=name,
                       district=district,
                       image=photo,
                       biography=bio)
        # BUG FIX: the phone number used to be scraped and then silently
        # discarded; attach it the same way sibling scrapers do.
        p.add_contact_detail(type='phone', value=phone, note='office phone')
        p.add_source(url)
        p.add_source(infopage.attrib['href'])
        yield p
def scrape(self):
    """Yield committees and their member Legislators from MEMBER_LIST."""
    page = self.lxmlize(MEMBER_LIST)
    # First row is the table header; skip it.
    for row in page.xpath("//table[@frame='void']/tbody/tr")[1:]:
        role, whos, expire = row.xpath("./*")
        people = zip([x.text_content() for x in whos.xpath(".//font")],
                     [x.text_content() for x in expire.xpath(".//font")])
        comm = Committee(name=role.text_content())
        url = role.xpath(".//a")[0].attrib['href']
        comm.add_link(url=url, note='homepage')
        for person, expire in people:
            if "TBA" in person:
                continue
            # Most entries read "Name, 123 Main St"; fall back to a
            # space-separated form when the comma is missing.
            try:
                info = re.match(r"(?P<name>.*), (?P<addr>\d+\w* .*)",
                                person).groupdict()
            except AttributeError:
                info = re.match(r"(?P<name>.*) (?P<addr>\d+\w* .*)",
                                person).groupdict()
            addr = info['addr']
            roles = {
                "Vice Chair": "Vice Chair",
                "Chair": "Chair",
                "CHAIR": "Chair",
                "Appt": "member",
            }
            position = "member"
            if "Resigned" in addr:
                continue
            # Titles are appended to the address text; peel them off.
            # ("Vice Chair" is checked before "Chair" by insertion order.)
            for marker in roles:
                if marker in addr:
                    addr, _ = [x.strip() for x in addr.rsplit(marker, 1)]
                    position = roles[marker]
            addr = clean_address(addr)
            leg = Legislator(name=info['name'], district=position)
            leg.add_contact_detail(type="address", value=addr, note="Address")
            leg.add_source(MEMBER_LIST)
            # BUG FIX: membership must be attached BEFORE the legislator
            # is yielded; consumers that persist the object at yield time
            # otherwise never see the committee membership.
            leg.add_membership(comm)
            yield leg
        comm.add_source(MEMBER_LIST)
        yield comm
def get_people(self):
    """Yield a Legislator for each (title, name) pair on the roster page."""
    doc = lxml.html.fromstring(self.urlopen(self.url))
    titles = doc.xpath('//div[contains(@class, "biotitle")]')
    names = doc.xpath('//div[contains(@class, "bioname")]')
    # Titles and names appear in parallel lists on the page.
    for title_el, name_el in zip(titles, names):
        person = Legislator(name=name_el.text_content().strip(),
                            post_id=title_el.text_content().strip())
        person.add_source(self.url)
        yield person
def cleveland_scrape_people(self):
    """Yield Cleveland city council members from the council listing page."""
    listing = "http://www.clevelandcitycouncil.org/council-members/"
    doc = self.lxmlize(listing)
    table = doc.xpath("//div[@class='standard-content column']//table")[0]
    for cell in table.xpath(".//td[@align='center']"):
        strong = cell.xpath(".//strong")[0]
        name = strong.text.strip()
        role = strong.xpath("./br")[0].tail.strip()
        img = cell.xpath(".//img")[0].attrib['src']
        # INFOSLUG extracts gender and district from the role text.
        info = INFOSLUG.match(role).groupdict()

        details = {}
        links = cell.xpath(".//a")
        if links != []:
            details = self.scrape_page(links[0].attrib['href'])

        kwargs = {}
        bio = details.get('bio', None)
        if bio:
            kwargs['biography'] = bio

        person = Legislator(name=name,
                            post_id=info['district'],
                            gender=info['gender'],
                            image=img,
                            **kwargs)
        person.add_source(listing)

        # Committee strings may carry a "- Chair" / "- Vice Chair" suffix.
        valid_titles = ["Chair", "Vice Chair"]
        for committee in details.get('committees', []):
            committee = committee.strip()
            if committee == "":
                continue
            membership_role = "member"
            if "-" in committee:
                base, title = (part.strip()
                               for part in committee.rsplit("-", 1))
                if title in valid_titles:
                    committee = base
                    membership_role = title
            person.add_committee_membership(committee, role=membership_role)
        yield person
def scrape_ward(self, el):
    """Build and return a Legislator from a ward link's detail page."""
    url = el.attrib['href']
    page = self.lxmlize(url)
    name = page.xpath("//div[@id='content-content']/h3")[0].text_content()
    # Strip a leading honorific from the displayed name.
    for prefix in ["Alderman"]:
        if name.startswith(prefix):
            name = name[len(prefix):].strip()
    district = page.xpath("//h1[@class='page-heading']/text()")[0]
    leg = Legislator(name=name, post_id=district)
    leg.add_source(url)
    # Maps the page's row label to a (contact type, note) pair.
    label_map = {
        "City Hall Office:": ("address", "City Hall Office"),
        "City Hall Phone:": ("phone", "City Hall Phone"),
        "Phone:": ("phone", "Personal Phone"),
        "Office:": ("address", "Personal Office"),
        "Fax:": ("fax", "Fax"),
        "Fax": ("fax", "Fax"),
    }
    for row in page.xpath("//table//tr"):
        label, value = (cell.text_content().strip()
                        for cell in row.xpath("./td"))
        if value == "":
            continue
        labels = [label]
        values = [value]
        if "\n" in label:
            # Multi-line cells are only usable when BOTH sides split;
            # a lopsided pair is skipped entirely.
            if "\n" not in value:
                continue
            labels = label.split("\n")
            values = value.split("\n")
        for label in labels:
            for value in values:
                ctype, note = label_map[label]
                leg.add_contact(ctype, value, note)
    return leg
def scrape_homepage(self, folk):
    """Scrape a member's homepage for name, photo and biography.

    Returns a sourced Legislator built from the linked page.
    """
    url = folk.attrib['href']
    page = self.lxmlize(url)
    image = page.xpath(
        "//img[contains(@src, 'uploadedImages/City_Council/Members/')]"
    )[0].attrib['src']
    # Exactly one <h3> is expected; single-element unpacking enforces it.
    name, = page.xpath("//div[@id='ctl00_ctl00_Body_body_cntCommon']/h3")
    paragraphs = page.xpath(
        "//div[@id='ctl00_ctl00_Body_body_cntCommon']/p")
    bio = "\n\n".join(p.text_content() for p in paragraphs)
    leg = Legislator(name=name.text,
                     post_id='member',
                     biography=bio,
                     image=image)
    leg.add_source(url)
    return leg
def nyc_scrape_people(self):
    """Yield NYC council members from the members table on MEMBER_PAGE."""
    page = self.lxmlize(MEMBER_PAGE)
    for row in page.xpath("//table[@id='members_table']//tr"):
        cells = row.xpath(".//td")
        if cells == []:
            continue  # header / spacer rows have no <td> cells
        name_cell, district_cell, borough_cell, party_cell = cells
        anchor = name_cell.xpath(".//a")[0]
        homepage = anchor.attrib['href']
        name = anchor.text
        district = district_cell.text
        borough = borough_cell.text  # extracted but not passed to Legislator
        party = party_cell.text
        # Homepage is fetched but its result is currently unused;
        # the call is kept to preserve original behavior.
        self.scrape_homepage(homepage)
        p = Legislator(name=name,
                       post_id=district,
                       party=party.strip() or "other")
        p.add_link(homepage, 'homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_PAGE)
        yield p
def get_people(self):
    """Yield Legislators from a hard-coded fixture roster.

    Each person also gets a synthetic office email built from a
    slugified (lowercased, space->dash, commas stripped) district name.
    """
    people = [
        {"name": "Mckenzie A. Cannon", "district": "10a"},
        {"name": "Yandel V. Watkins",
         "district": "Second Fnord and Norfolk"},
        {"name": "Adrien A. Coffey", "district": "A"},
        {"district": "10c", "name": "Natasha Moon"},
        {"district": "Berkshire, Hampshire, Franklin and Hampden",
         "name": "Ramon Harmon"},
        {"district": "5", "name": "Sam Sellers"},
        {"district": "6", "name": "Estrella Hahn"},
        {"district": "B", "name": "Teagan Rojas"},
        {"district": "C", "name": "Barrett Adams"},
        {"district": "D", "name": "Kayla Shelton"},
        {"district": "E", "name": "Kohen Dudley"},
        {"district": "F", "name": "Cayden Norman"},
        {"district": "ZZ", "name": "Shayla Fritz"},
        {"district": "Ward 2", "name": "Gunnar Luna"},
        {"district": "Green", "name": "Regina Cruz"},
        {"district": "Blue", "name": "Makenzie Keller"},
        {"district": "Red", "name": "Eliana Meyer"},
        {"district": "Yellow", "name": "Taylor Parrish"},
        {"district": "Silver", "name": "Callie Craig"},
    ]
    for person in people:
        l = Legislator(**person)
        l.add_source("http://example.com")
        dslug = (person['district'].lower()
                 .replace(" ", "-")
                 .replace(",", ""))
        # BUG FIX: the original applied "%"-formatting to a literal with
        # no "%s" placeholder, which raises TypeError at runtime
        # ("not all arguments converted during string formatting").
        l.add_contact_detail(type='email',
                             value="%s@example.com" % (dslug),
                             note='office email')
        yield l