def bos_scrape_people(self):
    """Yield a ``Legislator`` for every councillor on MEMBER_LIST.

    Walks the centered table cells of the member listing, pulls the
    portrait and profile link from each cell, then scrapes the member's
    homepage for a biography and (optionally) a better portrait.
    """
    listing = self.lxmlize(MEMBER_LIST)
    cells = listing.xpath(
        "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")
    for cell in cells:
        img_node = self.get_one(cell, ".//img")
        link_node = self.get_one(
            cell, ".//a[contains(@href, 'councillors') and (text()!='')]")
        # The member's role is the tail text right after the first <br>.
        role = cell.xpath(".//br")[0].tail.strip()
        # Portrait from the listing page; only used as a fallback if the
        # member's homepage does not supply one.
        image = img_node.attrib['src']
        homepage = link_node.attrib['href']
        name = clean_name(link_node.text)
        info = self.scrape_homepage(homepage)
        if info.get('image', None):
            image = info['image']
        legislator = Legislator(name=name, post_id=role, image=image,
                                biography=info['bio'])
        legislator.add_link(homepage, 'homepage')
        legislator.add_source(homepage)
        legislator.add_source(MEMBER_LIST)
        yield legislator
def bos_scrape_people(self):
    """Scrape the council member listing and yield Legislator objects.

    For each listing cell: portrait <img>, profile <a>, and the role in
    the tail text of the first <br>; the homepage scrape supplies the
    biography and may override the portrait.
    """
    selectors = (
        ".//img",
        ".//a[contains(@href, 'councillors') and (text()!='')]",
    )
    page = self.lxmlize(MEMBER_LIST)
    for td in page.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']"):
        portrait, anchor = (self.get_one(td, sel) for sel in selectors)
        role = td.xpath(".//br")[0].tail.strip()
        homepage = anchor.attrib['href']
        details = self.scrape_homepage(homepage)
        # Prefer the homepage portrait; fall back to the listing's <img>.
        image = details['image'] if details.get('image', None) \
            else portrait.attrib['src']
        person = Legislator(name=clean_name(anchor.text),
                            post_id=role,
                            image=image,
                            biography=details['bio'])
        person.add_link(homepage, 'homepage')
        person.add_source(homepage)
        person.add_source(MEMBER_LIST)
        yield person
def nyc_scrape_people(self):
    """Yield a ``Legislator`` for each data row of the NYC members table.

    Rows without <td> cells (e.g. the header row) are skipped.  Each data
    row is expected to hold exactly four cells: name, district, borough,
    party.
    """
    page = self.lxmlize(MEMBER_PAGE)
    for entry in page.xpath("//table[@id='members_table']//tr"):
        entries = entry.xpath(".//td")
        if not entries:  # header/spacer row — no data cells
            continue
        name, district, borough, party = entries
        name = name.xpath(".//a")[0]
        homepage = name.attrib['href']
        name, district, borough, party = [
            x.text for x in [name, district, borough, party]]
        # Fix: the result of this scrape was bound to an unused local
        # (`info`).  The call is kept for its fetch side effects, but the
        # dead binding is dropped.
        self.scrape_homepage(homepage)
        p = Legislator(name=name,
                       post_id=district,
                       # borough=borough,
                       party=party.strip() or "other")
        p.add_link(homepage, 'homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_PAGE)
        yield p
def nyc_scrape_people(self):
    """Scrape the NYC members table, yielding one Legislator per row.

    Skips rows with no <td> cells (the header).  Each remaining row must
    contain four cells in order: name, district, borough, party.
    """
    page = self.lxmlize(MEMBER_PAGE)
    for entry in page.xpath("//table[@id='members_table']//tr"):
        entries = entry.xpath(".//td")
        if not entries:  # no cells -> header row, skip
            continue
        name, district, borough, party = entries
        name = name.xpath(".//a")[0]
        homepage = name.attrib['href']
        name, district, borough, party = [
            x.text for x in [name, district, borough, party]
        ]
        # Fix: previously assigned to an unused `info` local; keep the
        # call (it performs the homepage fetch) without the dead binding.
        self.scrape_homepage(homepage)
        p = Legislator(
            name=name,
            post_id=district,
            # borough=borough,
            party=party.strip() or "other")
        p.add_link(homepage, 'homepage')
        p.add_source(homepage)
        p.add_source(MEMBER_PAGE)
        yield p
def scrape_councilor(self, url):
    """Scrape a single councillor's profile page into a ``Legislator``.

    Pulls name/district from the page header, then links, email, and
    phone/fax/address entries from the ``div.last`` contact block.
    """
    page = self.lxmlize(url)
    info = page.xpath("//div[@class='main']")[0]
    # Second <h3> holds "Councillor <name>"; strip the title prefix.
    name = info.xpath("//h3")[1].text_content().replace('Councillor','').strip()
    district = info.xpath("//p")[0].text_content()
    p = Legislator(name=name, district=district)
    # Rebind to the contact block; all lookups below are scoped to it
    # (though the '//' XPaths still search the whole document).
    info = info.xpath("//div[@class='last']")[0]
    # add links
    p.add_source(url)
    p.add_source(COUNCIL_PAGE)
    if "website:" in info.text_content():
        # assumes the second <a> in the block is the personal site —
        # TODO confirm (the first <a> is used as the email below)
        p.add_link(info.xpath('.//a')[1].attrib['href'], 'homepage')
    if "Facebook" in info.text_content():
        p.add_link(info.xpath('//a[contains(@href, "facebook.com")]')[0].attrib['href'],'facebook')
    if "Twitter" in info.text_content():
        p.add_link(info.xpath('//a[contains(@href,"twitter.com")]')[0].attrib['href'],'twitter')
    # add contact info
    p.add_contact('email', info.xpath('.//a')[0].text_content(),'')
    #//*[@id="content"]/div/div[1]/div[2]/p[1]
    contacts = info.xpath('//div/p[text()[contains(.,"Phone:")]]')
    for contact in contacts:
        # The <strong> label (e.g. office name) becomes the contact note.
        note = contact.xpath('.//strong')[0].text_content()
        # Rebind `contact` to the nodes following the first <br>:
        # alternating text nodes and <br> elements, addressed by fixed
        # even indexes below.
        contact = contact.xpath('br/following-sibling::node()')
        # NOTE(review): >8 siblings is treated as an unrecognized layout
        # and skipped entirely — confirm this matches the site's markup.
        if len(contact) > 8 :
            continue
        if len(contact) >= 4:
            # Long form: address lines at [0] and [2], phone at [4],
            # optional fax at [6].
            # NOTE(review): len == 4 or 5 would make contact[4]/[6]
            # reads fragile (IndexError at exactly 4) — verify against
            # live pages.
            address = (contact[0]+", "+contact[2]).strip()
            p.add_contact('address',address,note)
            if "Phone: " in contact[4]:
                phone = contact[4].replace("Phone: ",'').strip()
                p.add_contact('phone',phone,note)
            if len(contact) > 5 and "Fax:" in contact[6]:
                fax = contact[6].replace("Fax: ",'').strip()
                p.add_contact('fax',fax,note)
        else:
            # Short form: phone at [0], fax at [2], no address lines.
            phone = contact[0].strip()
            p.add_contact('phone',phone,note)
            fax = contact[2].strip()
            p.add_contact('fax',fax,note)