Example #1
    def bos_scrape_people(self):
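        # Yield a Legislator for each councillor on the member list page.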
        page = self.lxmlize(MEMBER_LIST)
        people = page.xpath(
            "//table[@width='100%']//td[@style='TEXT-ALIGN: center']")

        for person in people:
            image, name = [self.get_one(person, x) for x in [
                ".//img",
                ".//a[contains(@href, 'councillors') and (text()!='')]"
            ]]
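            # The councillor's role is the text immediately after the first <br/>.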
            role = person.xpath(".//br")[0].tail.strip()
            # Fallback if we don't get one from the homepage.
            image = image.attrib['src']
            homepage = name.attrib['href']
            name = clean_name(name.text)
            info = self.scrape_homepage(homepage)
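            # Prefer the image from the member's homepage when one is found.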
            if info.get('image', None):
                image = info['image']

            p = Legislator(name=name,
                           post_id=role,
                           image=image,
                           biography=info['bio'])
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_LIST)
            yield p
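
Example #1 relies on a self.get_one(node, xpath) helper that isn't shown. A minimal sketch, assuming it only asserts that the XPath matches exactly one node (the real helper may do more):

    def get_one(self, node, xpath):
        # Return the single node matching the XPath; fail loudly otherwise.
        matches = node.xpath(xpath)
        assert len(matches) == 1
        return matches[0]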
Example #2
    def nyc_scrape_people(self):
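        # Walk the members table, one row per council member.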
        page = self.lxmlize(MEMBER_PAGE)
        for entry in page.xpath("//table[@id='members_table']//tr"):
            entries = entry.xpath(".//td")
            if not entries:  # skip rows without data cells
                continue

            name, district, borough, party = entries
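            # The name cell wraps a link to the member's homepage.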
            name = name.xpath(".//a")[0]
            homepage = name.attrib['href']
            name, district, borough, party = [
                x.text for x in [name, district, borough, party]
            ]

            info = self.scrape_homepage(homepage)  # result is currently unused
            p = Legislator(name=name,
                           post_id=district,
                           # borough=borough,
                           party=party.strip() or "other")
            p.add_link(homepage, 'homepage')
            p.add_source(homepage)
            p.add_source(MEMBER_PAGE)
            yield p
Example #3
    def scrape_councilor(self, url):
        page = self.lxmlize(url)
        info = page.xpath("//div[@class='main']")[0]
        name = info.xpath("//h3")[1].text_content().replace('Councillor', '').strip()
        district = info.xpath("//p")[0].text_content()
        p = Legislator(name=name, district=district)

        info = info.xpath("//div[@class='last']")[0]

        # add sources and links
        p.add_source(url)
        p.add_source(COUNCIL_PAGE)

        if "website:" in info.text_content():
            p.add_link(info.xpath('.//a')[1].attrib['href'], 'homepage')

        if "Facebook" in info.text_content():
            p.add_link(info.xpath('//a[contains(@href, "facebook.com")]')[0].attrib['href'], 'facebook')

        if "Twitter" in info.text_content():
            p.add_link(info.xpath('//a[contains(@href, "twitter.com")]')[0].attrib['href'], 'twitter')

        # add contact info; the first link in the block is the email address
        p.add_contact('email', info.xpath('.//a')[0].text_content(), '')
        contacts = info.xpath('//div/p[text()[contains(., "Phone:")]]')
        for contact in contacts:
            note = contact.xpath('.//strong')[0].text_content()
            # Text nodes and <br/> elements alternate in the result, so the
            # values of interest sit at the even indices 0, 2, 4 and 6.
            contact = contact.xpath('br/following-sibling::node()')
            if len(contact) > 8:
                continue
            if len(contact) >= 4:
                address = (contact[0] + ", " + contact[2]).strip()
                p.add_contact('address', address, note)
                if len(contact) > 4 and "Phone: " in contact[4]:
                    phone = contact[4].replace("Phone: ", '').strip()
                    p.add_contact('phone', phone, note)
                if len(contact) > 6 and "Fax:" in contact[6]:
                    fax = contact[6].replace("Fax: ", '').strip()
                    p.add_contact('fax', fax, note)
            else:
                phone = contact[0].strip()
                p.add_contact('phone', phone, note)
                fax = contact[2].strip()
                p.add_contact('fax', fax, note)
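
All of the examples above fetch pages through a self.lxmlize(url) helper on the scraper class. A minimal sketch of such a method, assuming requests and lxml are available and that relative hrefs (such as the councillor homepages read from attrib['href']) should be resolved against the page URL:

    import requests
    import lxml.html

    def lxmlize(self, url):
        # Fetch the page and parse the HTML into an lxml element tree.
        response = requests.get(url)
        page = lxml.html.fromstring(response.text)
        # Make relative links absolute so attrib['href'] values are usable as-is.
        page.make_links_absolute(url)
        return page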