Python CanadianLegislator.image示例，utils.CanadianLegislator.image Python示例

示例#1

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for every "<name>, <post>" heading on COUNCIL_PAGE.

    A heading whose post contains "Mayor" becomes the 'Maire' of
    'Beaconsfield'; any other heading becomes a 'Conseiller' whose post_id is
    the second '-'-separated piece of the post. Each person gets the council
    page as source, a photo and an e-mail address decoded by get_email()
    from an inline script. The mayor's phone number is required; a
    councillor's phone number is added only when one is found.
    """
    root = lxmlize(COUNCIL_PAGE)

    for heading in root.xpath('//h1[@class="title"]'):
      title = heading.text_content()
      # Headings without a comma do not describe a council member.
      if ',' not in title:
        continue
      raw_name, post = title.split(',')
      full_name = raw_name.strip()

      if 'Mayor' not in post:
        # Councillor: contact details are read from sibling <p> elements.
        ward = post.split('-')[1].strip()
        member = Legislator(name=full_name, post_id=ward, role='Conseiller')
        member.add_source(COUNCIL_PAGE)

        member.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]

        phone_texts = heading.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
        if phone_texts:
          number = phone_texts[0].split(':')[1].strip().replace(' ', '-')
          member.add_contact('voice', number, 'legislature')
        script_node = heading.xpath('.//parent::div/following-sibling::p/script')[0]
        member.add_contact('email', get_email(script_node.text_content()), None)
        yield member
        continue

      # Mayor: contact details are read from sibling <div> elements, and a
      # missing phone number raises IndexError instead of being skipped.
      mayor = Legislator(name=full_name, post_id='Beaconsfield', role='Maire')
      mayor.add_source(COUNCIL_PAGE)
      mayor.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
      phone_text = heading.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
      number = phone_text.split(':')[1].strip().replace(' ', '-')
      mayor.add_contact('voice', number, 'legislature')
      script_node = heading.xpath('.//parent::div/following-sibling::div/script')[0]
      mayor.add_contact('email', get_email(script_node.text_content()), None)
      yield mayor

示例#2

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each council member row found on COUNCIL_PAGE.

    Candidate rows are the first <tr> of every table that has exactly two
    child elements; the last candidate is discarded. The non-empty text
    fragments of a row end with name, phone and e-mail, in that order. A row
    with exactly three fragments is treated as the mayor; otherwise the
    first fragment (minus the "numero " prefix) is the district.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    for councillor_tr in councillor_trs:
      desc = [text.strip() for text in councillor_tr.xpath('.//text()[normalize-space()]') if text.strip()]

      if len(desc) == 3:
        role = 'Maire'
        district = u'Saint-Jérôme'
      else:
        role = 'Conseiller'
        district = desc[0].replace(u'numéro ', '')

      name = desc[-3]
      phone = desc[-2]
      email = desc[-1]

      # XPath string() already returns the whole attribute value as a
      # string; indexing it with [0] kept only its first character.
      image = councillor_tr.xpath('string(.//img/@src)')

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.image = image
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('email', email, None)
      yield p

示例#3

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield a Legislator for each council member row on COUNCIL_PAGE.

        Candidate rows are the first <tr> of every table that has exactly
        two child elements; the last candidate is discarded. The non-empty
        text fragments of a row end with name, phone and e-mail, in that
        order. A row with exactly three fragments is treated as the mayor;
        otherwise the first fragment (minus the "numero " prefix) is the
        district.
        """
        page = lxmlize(COUNCIL_PAGE)

        councillor_trs = [
            tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2
        ][:-1]
        for councillor_tr in councillor_trs:
            desc = [
                text.strip()
                for text in councillor_tr.xpath('.//text()[normalize-space()]')
                if text.strip()
            ]

            if len(desc) == 3:
                role = 'Maire'
                district = u'Saint-Jérôme'
            else:
                role = 'Conseiller'
                district = desc[0].replace(u'numéro ', '')

            name = desc[-3]
            phone = desc[-2]
            email = desc[-1]

            # XPath string() already returns the whole attribute value as a
            # string; indexing it with [0] kept only its first character.
            image = councillor_tr.xpath('string(.//img/@src)')

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email, None)
            yield p

示例#4

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each member linked from COUNCIL_PAGE.

    A link whose text contains "Ward <n>" is a 'Councillor' for that ward;
    any other link is the 'Mayor' of 'Kawartha Lakes'. The e-mail address
    and photo are read from the member's own page, which is recorded as a
    second source.
    """
    listing = lxmlize(COUNCIL_PAGE)

    for link in listing.xpath('//p[@class="WSIndent"]/a'):
      label = link.text_content()
      ward_match = re.search(r'(Ward [0-9]{1,2})', label)
      if ward_match is None:
        post = 'Kawartha Lakes'
        title = 'Mayor'
        member_name = label.replace('Mayor', '').strip()
      else:
        post = ward_match.group(1)
        title = 'Councillor'
        member_name = label.replace(post, '').strip()

      profile_url = link.attrib['href']
      profile = lxmlize(profile_url)
      mailto = profile.xpath('//a[contains(@href, "mailto:")]/@href')[0]
      # Keep only what follows the last ':' of the mailto link.
      address = mailto.rsplit(':', 1)[1].strip()
      photo = profile.xpath('//img[@class="image-right"]/@src')[0]

      member = Legislator(name=member_name, post_id=post, role=title)
      member.add_source(COUNCIL_PAGE)
      member.add_source(profile_url)
      member.add_contact('email', address, None)
      member.image = photo
      yield member

示例#5

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each non-empty member paragraph on COUNCIL_PAGE.

    The name is looked up in three places in turn (font/b, font, then b/font
    when the first hit mentions "e-mail"). A name containing "Mayor" gives
    the role 'Mayor' (with that word removed from the name); everyone else
    is a 'Councillor'. All members share the post_id "LaSalle". A home
    phone number is required; an office phone number is optional.
    """
    root = lxmlize(COUNCIL_PAGE)

    for cell in root.xpath('//div[@align="center" and not(@class="background")]//td/p'):
      body = cell.text_content()
      if not body.strip():
        continue

      candidates = cell.xpath('./font/b/text()') or cell.xpath('./font/text()')
      if 'e-mail' in candidates[0]:
        candidates = cell.xpath('./b/font/text()')
      member_name = candidates[0]

      if 'Mayor' in member_name:
        member_name = member_name.replace('Mayor', '')
        title = 'Mayor'
      else:
        title = 'Councillor'

      member = Legislator(name=member_name, post_id="LaSalle", role=title)
      member.add_source(COUNCIL_PAGE)

      member.image = cell.xpath('./parent::td//img/@src')[0]

      address = cell.xpath('.//a[contains(@href, "mailto:")]/text()')[0]
      member.add_contact('email', address, None)

      # Office number: whatever sits between "phone:" and "home".
      office = re.findall(r'(?<=phone:)(.*)(?=home)', body, flags=re.DOTALL)
      if office:
        member.add_contact('voice', office[0].strip(), 'legislature')

      # Home number: everything after "home phone:"; raises IndexError if absent.
      home = re.findall(r'(?<=home phone:)(.*)', body, flags=re.DOTALL)
      member.add_contact('voice', home[0].strip(), 'residence')
      yield member

示例#6

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each table cell on COUNCIL_PAGE.

    The first <strong> text of a cell carries the name (before the first
    comma, without "Name:"). A cell mentioning "Mayor" but not
    "Deputy Mayor" is the 'Mayor' of 'Fredericton'; every other cell is a
    'Councillor' whose district is the parenthesised part of the "Ward:"
    line, without a trailing " Area". Address and home phone are required;
    an office phone is added when present.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//table/tbody/tr/td')
    for councillor in councillors:
      text = councillor.xpath('.//strong/text()')[0]
      name = text.split(',')[0].replace('Name:', '').strip()
      if 'Mayor' in text and 'Deputy Mayor' not in text:
        role = 'Mayor'
        district = 'Fredericton'
      else:
        district = re.findall(r'(Ward:.*)(?=Address:)', councillor.text_content())[0].replace(':', '').strip()
        # Raw string: '\(' is not a valid str escape and newer Python
        # versions warn about it in a normal string literal.
        district = re.search(r'\((.+?)(?: Area)?\)', district).group(1)
        role = 'Councillor'

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)

      p.image = councillor.xpath('.//img/@src')[0]

      address = re.findall(r'(?<=Address:).*(?=Home:)', councillor.text_content())[0].strip()
      p.add_contact('address', address, 'legislature')

      phone = re.findall(r'(?<=Home: \().*(?=Fax:)', councillor.text_content())[0]
      # Turn the ")" and its following separator into "-", then keep only
      # the first whitespace-delimited token.
      phone = re.sub(r'(?<=[0-9])(\)\D{1,2})(?=[0-9])', '-', phone).split()[0]
      p.add_contact('voice', phone, 'residence')

      phone = re.findall(r'(?<=Office: \().*(?=Fax:)', councillor.text_content())
      if phone:
        phone = phone[0].replace(') ', '-')
        p.add_contact('voice', phone, 'legislature')

      yield p

示例#7

0

显示文件

def scrape_mayor(url):
    """Build the 'Mayor' Legislator for 'Saskatoon' from the page at *url*.

    The name and photo come from *url*; address, phone and fax come from the
    page linked as "Contact the Mayor". Only *url* is recorded as a source.
    Returns the populated Legislator.
    """
    def dashed(number):
        # "(AAA) BBB-CCCC" -> "AAA-BBB-CCCC"
        return number.replace('(', '').replace(') ', '-')

    bio = lxmlize(url)
    last_paragraph = bio.xpath('//tr/td/p')[-1]
    mayor_name = last_paragraph.text_content().replace('Mayor', '')
    portrait = bio.xpath('//div[@class="sask_ArticleBody"]//img/@src')[0]

    contact_href = bio.xpath('//a[contains(text(), "Contact the Mayor")]/@href')[0]
    contact_page = lxmlize(contact_href)

    # The first text node of the address paragraph is skipped.
    address_lines = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[4]/text()'
    )
    mailing_address = ' '.join(address_lines[1:])
    voice = dashed(contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[5]/span/text()'
    )[0])
    fax_number = dashed(contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[6]/span/text()'
    )[0])

    mayor = Legislator(name=mayor_name, post_id='Saskatoon', role='Mayor')
    mayor.add_source(url)
    mayor.image = portrait
    mayor.add_contact('address', mailing_address, 'legislature')
    mayor.add_contact('voice', voice, 'legislature')
    mayor.add_contact('fax', fax_number, 'legislature')
    return mayor

示例#8

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each member linked from the COUNCIL_PAGE menu.

    The first link is the 'Mayor' of 'Ajax'. For every other link, the
    district ("Ward ...") and role ("Councillor" or "Regional Councillor")
    are parsed from the <h1> of the member's page. Rows of the first
    "datatable" (after its first row) become contacts when their label
    starts with Phone, Fax or Email, and links otherwise.
    """
    index = lxmlize(COUNCIL_PAGE)

    links = index.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for position, link in enumerate(links):
      member_name = link.text_content()

      profile_url = link.attrib['href']
      profile = lxmlize(profile_url)

      if position == 0:
        post = 'Ajax'
        title = 'Mayor'
      else:
        heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
        post = re.findall(r'Ward.*', heading)[0].strip()
        # findall returns one tuple per match; its first item is the whole role.
        title = re.findall('((Regional)? ?(Councillor))', heading)[0][0]

      member = Legislator(name=member_name, post_id=post, role=title)
      member.add_source(COUNCIL_PAGE)
      member.add_source(profile_url)

      member.image = profile.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]

      for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
        cells = row.xpath('./td')
        label = cells[0].text_content().strip()
        value = cells[1].text_content().strip()
        if not re.match(r'(Phone)|(Fax)|(Email)', label):
          member.add_link(value, None)
          continue
        kind = CONTACT_DETAIL_TYPE_MAP[label]
        member.add_contact(kind, value, None if kind == 'email' else 'legislature')
      yield member

示例#9

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_councilor(self, page, h1, url):
        """Build a 'Councillor' Legislator from an already-parsed page.

        Args:
            page: parsed councillor page; only its .xpath() method is used.
            h1: heading text; the name is whatever follows "Councillor".
            url: address of the councillor page, recorded as a source.

        Returns:
            The populated Legislator, with post_id set to "Ward <n>".
        """
        member_name = h1.split('Councillor')[1]
        ward_text = page.xpath('string(//strong[not(@class)])')
        # Replace non-breaking spaces so the pattern below can match.
        ward_text = ward_text.replace(u'\xa0', u' ')
        ward_number, _ward_label = re.search(r'(Ward \d+) (.+)', ward_text).groups()

        member = Legislator(name=member_name, post_id=ward_number, role='Councillor')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)

        member.image = page.xpath('string(//main//img/@src)')
        member.add_contact(
            'email', page.xpath('string((//a[contains(@href, "@")])[1])'), None)

        city_hall_cell = page.xpath(
            '//*[contains(text(), "Toronto City Hall")]/ancestor::td')[0]
        phone_line = city_hall_cell.xpath(
            'string((.//text()[contains(., "Phone:")])[1])')
        member.add_contact('voice', phone_line.split(':')[1], 'legislature')

        # At most the first two text nodes of the second paragraph.
        street_lines = city_hall_cell.xpath('./p[2]/text()')[:2]
        street = '\n'.join(street_lines)
        if street:
            member.add_contact('address', street, 'legislature')

        return member

示例#10

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then a 'Conseiller' Legislator per linked table cell.

    The first matching cell is handed to scrape_mayor(). Each later cell
    that contains links supplies the name (first link), district (second
    link) and profile URL; the profile page supplies phone numbers (every
    text fragment containing a digit), links via get_links() and the e-mail
    address.
    """
    listing = lxmlize(COUNCIL_PAGE)

    cells = listing.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield scrape_mayor(cells[0])
    for cell in cells[1:]:
      anchors = cell.xpath('.//a')
      if not anchors:
        continue

      member_name = anchors[0].text_content().strip()
      ward = anchors[1].text_content()
      profile_url = cell.xpath('.//a/@href')[0]
      profile = lxmlize(profile_url)

      member = Legislator(name=member_name, post_id=ward, role='Conseiller')
      member.add_source(COUNCIL_PAGE)
      member.add_source(profile_url)

      # The photo is the last image in the cells preceding this one.
      member.image = cell.xpath('./preceding-sibling::td//img/@src')[-1]

      for fragment in profile.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()'):
        if re.search(r'[0-9]', fragment):
          number = fragment.strip().replace(' ', '-')
          member.add_contact('voice', number, 'legislature')
      get_links(member, profile.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

      mailto = profile.xpath('string(//a[contains(@href, "mailto:")]/@href)')
      member.add_contact('email', mailto[len('mailto:'):], None)
      yield member

示例#11

0

显示文件

  def get_people(self):
    """Yield the 'Maire' of 'Pointe-Claire', then one 'Conseiller' per cell.

    The mayor is read from the second paragraph of the first table in the
    "item-page clearfix" block. Councillors are read from every
    odd-indexed <tr> of the page: each cell gives the name and photo, and
    the cell at the same position in the following row gives the district
    and phone number.
    """
    root = lxmlize(COUNCIL_PAGE)
    phone_pattern = r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}'

    mayor_par = root.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    mayor_name = mayor_par.xpath('.//strong/text()')[0]

    mayor = Legislator(name=mayor_name, post_id='Pointe-Claire', role='Maire')
    mayor.add_source(COUNCIL_PAGE)

    mayor_phone = re.findall(phone_pattern, mayor_par.text_content())[0]
    mayor.add_contact('voice', mayor_phone.replace(' ', '-'), 'legislature')
    yield mayor

    rows = root.xpath('//tr')
    for idx in range(1, len(rows), 2):
      for col, cell in enumerate(rows[idx].xpath('./td')):
        member_name = cell.text_content()
        details_row = rows[idx + 1]
        # NOTE(review): a district number also appears to be available from
        # the "maps" link text in the details row - confirm before using.
        ward = details_row.xpath('.//td/p[1]/text()')[col].replace(' / ', '/')

        member = Legislator(name=member_name, post_id=ward, role='Conseiller')
        member.add_source(COUNCIL_PAGE)
        member.image = cell.xpath('.//img/@src')[0]

        details_text = details_row.xpath('.//td')[col].text_content()
        number = re.findall(phone_pattern, details_text)[0].replace(' ', '-')

        member.add_contact('voice', number, 'legislature')

        yield member

示例#12

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_mayor(self, url):
        """Build the 'Mayor' Legislator for "Toronto" from the page at *url*.

        The name and photo come from *url*; the address and phone number
        come from the page linked as "Contact the Mayor". COUNCIL_PAGE,
        *url* and the contact page are all recorded as sources.
        """
        bio = lxmlize(url)
        heading = bio.xpath("//h1/text()")[0]
        mayor_name = heading.replace("Toronto Mayor", "").strip()

        mayor = Legislator(mayor_name, post_id="Toronto", role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(url)

        mayor.image = bio.xpath('string(//article/img/@src)')

        contact_link = bio.xpath('//a[contains(text(), "Contact the Mayor")]')[0]
        # @todo fix lxmlize to use the redirected URL to make links absolute
        contact_url = contact_link.attrib['href'].replace('www.', 'www1.')
        mayor.add_source(contact_url)
        contact_page = lxmlize(contact_url)

        # The first two <h3> headings introduce the mail and phone sections.
        mail_heading, phone_heading = contact_page.xpath('//h3')[:2]
        street = ''.join(mail_heading.xpath('./following-sibling::p//text()'))
        number = phone_heading.xpath('string(./following-sibling::p[1])')

        mayor.add_contact('address', street, 'legislature')
        mayor.add_contact('voice', number, 'legislature')
        return mayor

示例#13

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

def scrape_mayor(url):
    """Build the 'Mayor' Legislator for 'Moncton' from the page at *url*.

    The name is the second and third words of the second content paragraph.
    Address, phone, fax and e-mail come from the second "whiteroundedbox"
    cell match. A phone number with exactly seven digits gets the "506-"
    prefix. Returns the populated Legislator.
    """
    bio = lxmlize(url)
    intro_words = bio.xpath('//div[@id="content"]/p[2]/text()')[0].split()
    mayor_name = ' '.join(intro_words[1:3])

    mayor = Legislator(name=mayor_name, post_id='Moncton', role='Mayor')
    mayor.add_source(url)

    mayor.image = bio.xpath('//div[@id="content"]/p[1]/img/@src')[0]

    box = bio.xpath('//table[@class="whiteroundedbox"]//tr[2]/td[1]')[1]
    street = ', '.join(box.xpath('./p[1]/text()')[1:4])
    # Collapse runs of whitespace left over from the page markup.
    street = re.sub(r'\s{2,}', ' ', street).strip()
    numbers = box.xpath('.//p[2]/text()')
    voice = numbers[0].split(':')[1].strip()
    fax_number = numbers[1].split(':')[1].strip()
    address = box.xpath('.//a/@href')[0].split(':')[1].strip()

    mayor.add_contact('address', street, 'legislature')
    digit_count = len(re.sub(r'\D', '', voice))
    if digit_count == 7:
        voice = '506-%s' % voice
    mayor.add_contact('voice', voice, 'legislature')
    mayor.add_contact('fax', fax_number, 'legislature')
    mayor.add_contact('email', address, None)

    return mayor

示例#14

0

显示文件

  def get_people(self):
    """Yield a Legislator for each non-empty table cell on COUNCIL_PAGE.

    The very first cell of the table is the 'Maire' of 'Kirkland'. Every
    other cell is a 'Conseiller' whose district is the text after "- " in
    the cell's <h2>, with " Ouest"/" Est" lower-cased. The page is parsed as
    iso-8859-1.
    """
    root = lxmlize(COUNCIL_PAGE, 'iso-8859-1')

    cells = root.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    for position, cell in enumerate(cells):
      if not cell.text_content().strip():
        continue

      if position == 0:
        post = 'Kirkland'
        title = 'Maire'
      else:
        heading = cell.xpath('.//h2')[0].text_content()
        post = re.search('- (.+)', heading).group(1).strip()
        post = post.replace(' Ouest', ' ouest').replace(' Est', ' est')
        title = 'Conseiller'

      member_name = cell.xpath('.//strong/text()')[0]

      # Drop the "T " prefix, dash the number, and write ", # " as an " x" extension.
      raw_phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
      number = raw_phone.replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
      address = cell.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

      member = Legislator(name=member_name, post_id=post, role=title)
      member.add_source(COUNCIL_PAGE)
      member.add_contact('voice', number, 'legislature')
      member.add_contact('email', address, None)
      member.image = cell.xpath('.//img/@src')[0]
      yield member

示例#15

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then a 'Councillor' Legislator per councillor paragraph.

    COUNCIL_PAGE is fetched with an explicit browser user agent. The mayor
    is built by self.scrape_mayor() from the same parsed page. A councillor
    paragraph is any <p> with a <strong> or <b> child whose text contains
    "Councillor"; the district is the text after "Ward <digit>, ". A phone
    number with exactly seven digits gets the "902-" prefix.
    """
    page = lxmlize(COUNCIL_PAGE, user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')

    yield self.scrape_mayor(page)

    councillors = page.xpath('//strong[contains(text(), "Councillor")]/parent::p|//b[contains(text(), "Councillor")]/parent::p')
    for councillor in councillors:

      name = councillor.xpath('./strong/text()|./b/text()')[0].replace('Councillor', '').strip()
      # Raw string: '\d' is not a valid str escape and newer Python versions
      # warn about it in a normal string literal.
      district = re.findall(r'(?<=Ward \d, ).*', councillor.text_content())[0].strip()

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)

      p.image = councillor.xpath('.//img/@src')[0]

      # The phone line may sit in a later paragraph: step to the second
      # following sibling <p> until one mentions "Phone". An IndexError is
      # raised when no such sibling is left.
      phone = re.findall(r'Phone(.*)', councillor.text_content())
      node = councillor
      while not phone:
        node = node.xpath('./following-sibling::p')[1]
        phone = re.findall(r'Phone(.*)', node.text_content())
      phone = phone[0].strip()

      # Fall back to the mailto links of the following sibling paragraphs.
      email = councillor.xpath('.//a[contains(@href, "mailto:")]')
      if not email:
        email = councillor.xpath('./following-sibling::p//a[contains(@href, "mailto")]')
      email = email[0].text_content()

      if len(re.sub(r'\D', '', phone)) == 7:
        phone = '902-%s' % phone
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('email', email, None)

      yield p

示例#16

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then a Legislator for each councillor paragraph.

    The mayor comes from mayor_info(MAYOR_PAGE). Each paragraph in the
    "news" block gives the district (bold text starting at the first "W" or
    "R") and a link to the councillor's page. A district containing
    "Regional" becomes a 'Regional Councillor' for 'Cambridge'. Photo,
    address, phone, optional fax and e-mail are read from the councillor's
    page.
    """
    yield mayor_info(MAYOR_PAGE)

    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="news"]//p')
    for councillor in councillors:
      district = councillor.xpath('./b')[0].text_content()
      district = re.findall(u'(?:W|R).*', district)[0]
      role = 'Councillor'
      if 'Regional' in district:
        district = 'Cambridge'
        role = 'Regional Councillor'
      name = councillor.xpath('.//a')[0].text_content()

      url = councillor.xpath('.//a')[0].attrib['href']
      profile = lxmlize(url)

      image = profile.xpath('//img[contains(@src, "councilImages")]/@src')[0]
      address = profile.xpath('//*[contains(text(),"Address")]/ancestor::td')[-1].text_content().split(':')[-1].replace("\t", '')
      phone = profile.xpath('//*[contains(text(),"Tel")]/ancestor::td')[-1].text_content().split(':')[-1].replace("\t", '')
      phone = phone.replace('(', '').replace(') ', '-')

      # Reset the fax for every councillor. Previously a page without a fax
      # either raised a NameError (first councillor) or silently reused the
      # previous councillor's fax number.
      fax = None
      fax_cells = profile.xpath('//*[contains(text(),"Fax")]/ancestor::td')
      if fax_cells:
        fax = fax_cells[-1].text_content().split(':')[-1].replace("\t", '')
        fax = fax.replace('(', '').replace(') ', '-')
      email = profile.xpath('//a[contains(@href,"mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('address', address, 'legislature')
      p.add_contact('voice', phone, 'legislature')
      if fax is not None:
        p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)
      p.image = image
      yield p

示例#17

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each <strong> element in the print area.

    The <strong> text is the name. The text nodes of its parent <p> (or
    parent <div> when the <p> has none) hold the post line followed by
    "label: value" contact lines. A post line mentioning "Mayor" gives the
    'Mayor' of 'Woolwich'; otherwise the post is "Ward <digit>" with
    "Councillor" removed.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@id="printArea"]//strong')
    for councillor in councillors:
      info = councillor.xpath('./parent::p/text()')
      if not info:
        info = councillor.xpath('./parent::div/text()')
      info = [x for x in info if x.strip()]
      # Cut everything that follows "Ward <digit>". Raw string: '\d' is not
      # a valid str escape and newer Python versions warn about it.
      district = re.sub(r'(?<=Ward \d).+', '', info.pop(0))
      if 'Mayor' in district:
        district = 'Woolwich'
        role = 'Mayor'
      else:
        district = district.replace('Councillor', '').strip()
        role = 'Councillor'

      p = Legislator(name=councillor.text_content(), post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.image = councillor.xpath('./img/@src')[0]

      # Each remaining line must be exactly "label: value"; the label is
      # passed as both the contact type and the note.
      for contact in info:
        note, num = contact.split(':')
        num = num.strip().replace('(', '').replace(') ', '-').replace('extension ', 'x')
        p.add_contact(note, num, note)
      yield p

示例#18

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each member linked from the COUNCIL_PAGE subnav.

    The first link is delegated to self.scrape_mayor(name, url). Every other
    link is a 'Councillor' whose phone, fax, e-mail and photo come from the
    member's own page; an address is added only when a "City of
    Burlington," paragraph is found.
    """
    index = lxmlize(COUNCIL_PAGE)

    links = index.xpath('//div[@id="subnav"]//a')
    for position, link in enumerate(links):
      member_name = link.xpath('./span/text()')[0].strip()
      ward = link.xpath('.//strong')[0].text_content()

      profile_url = link.attrib['href']

      if position == 0:
        yield self.scrape_mayor(member_name, profile_url)
        continue

      profile = lxmlize(profile_url)

      address_nodes = profile.xpath('//div[@id="content"]//p[contains(text(),"City of Burlington,")]')
      contact_par = profile.xpath('//div[@id="subnav"]//p[contains(text(),"Phone")]')[0]
      contact_text = contact_par.text_content()
      # Extensions written as "Ext. " or "#" are normalised to "x".
      voice = re.findall(r'Phone: (.*)', contact_text)[0].replace('Ext. ', 'x').replace('#', 'x')
      fax_number = re.findall(r'Fax: (.*)', contact_text)[0]
      # NOTE(review): "//a" is document-absolute, so this picks the first
      # mailto link of the whole page, not of the contact paragraph -
      # confirm that this is intended.
      mail = contact_par.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

      member = Legislator(name=member_name, post_id=ward, role='Councillor')
      member.add_source(COUNCIL_PAGE)
      member.add_source(profile_url)

      member.image = profile.xpath('//div[@id="subnav"]//img/@src')[0]

      if address_nodes:
        member.add_contact('address', address_nodes[0].text_content(), 'legislature')
      member.add_contact('voice', voice, 'legislature')
      member.add_contact('fax', fax_number, 'legislature')
      member.add_contact('email', mail, None)

      yield member

示例#19

0

显示文件

  def get_people(self):
    """Yield a Legislator for each member linked from COUNCIL_PAGE.

    A member page whose first <h2> text contains "Maire" is the 'Maire' of
    'Sherbrooke'; any other page is a 'Conseiller' whose district comes
    from the first target="_blank" link. The districts 'de Brompton' and
    'de Lennoxville' lose their "de " prefix and get a boundary_url extra.
    """
    boundary_urls = {
      'Brompton': '/boundaries/sherbrooke-boroughs/brompton/',
      'Lennoxville': '/boundaries/sherbrooke-boroughs/lennoxville/',
    }
    index = lxmlize(COUNCIL_PAGE)

    for link in index.xpath('//div[@id="c2087"]//a'):
      member_name = link.text_content()
      profile_url = link.attrib['href']
      profile = lxmlize(profile_url)

      if 'Maire' in profile.xpath('//h2/text()')[0]:
        post = 'Sherbrooke'
        title = 'Maire'
      else:
        raw_post = profile.xpath('//div[@class="csc-default"]//a[@target="_blank"]/text()')[0]
        post = raw_post.replace('district', '').replace('Domaine Howard', 'Domaine-Howard').strip()
        title = 'Conseiller'
      if post in ('de Brompton', 'de Lennoxville'):
        post = post.replace('de ', '')

      member = Legislator(name=member_name, post_id=post, role=title)
      member.add_source(COUNCIL_PAGE)
      member.add_source(profile_url)
      member.image = profile.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]

      # "<label>:<number>": the label doubles as contact type and note.
      pieces = profile.xpath('//li[contains(text(), "phone")]/text()')[0].split(':')
      label, number = pieces[0], pieces[1]
      member.add_contact(label, number, label)

      mailto_links = profile.xpath('//a[contains(@href, "mailto:")]/@href')
      if mailto_links:
        member.add_contact('email', mailto_links[0].split(':')[1], None)

      if post in boundary_urls:
        member.add_extra('boundary_url', boundary_urls[post])
      yield member

示例#20

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def scrape_mayor(self, name, url):
    """Build the 'Mayor' Legislator for "Burlington".

    Args:
      name: the mayor's name, used as given.
      url: the mayor's page; its right sidebar supplies phone, fax, e-mail
        and photo.

    The address is read from the "Contact" page linked in the menu of
    http://www.burlingtonmayor.com. Returns the populated Legislator.
    """
    sidebar_page = lxmlize(url)

    contact_lines = sidebar_page.xpath('//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
    voice = contact_lines[0]
    fax_number = contact_lines[1]
    mail = sidebar_page.xpath('//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()')[0]

    mayor_site = lxmlize('http://www.burlingtonmayor.com')
    contact_link = mayor_site.xpath('//div[@class="menu"]//a[contains(text(),"Contact")]')[0]
    contact_page = lxmlize(contact_link.attrib['href'])
    street = contact_page.xpath('//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0].text_content()

    mayor = Legislator(name=name, post_id="Burlington", role='Mayor')
    for source in (COUNCIL_PAGE, url, 'http://www.burlingtonmayor.com'):
      mayor.add_source(source)

    mayor.image = sidebar_page.xpath('//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
    mayor.add_contact('voice', voice, 'legislature')
    mayor.add_contact('fax', fax_number, 'legislature')
    mayor.add_contact('email', mail, None)
    mayor.add_contact('address', street, 'legislature')

    return mayor

示例#21

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

def councillor_data(url):
  """Build a 'Councillor' Legislator from the councillor page at *url*.

  Name, district, address and photo are read as XPath string values. The
  phone number comes from get_phone_data() and the e-mail address from
  email_js(), which is given the text of the page's inline script.
  """
  doc = lxmlize(url)

  member_name = doc.xpath('string(//h1[@id="TitleOfPage"])')
  ward = doc.xpath('string(//h2)')
  street = doc.xpath('string(//div[@class="asideContent"])')
  portrait = doc.xpath('string(//div[@id="contentright"]//img[1]/@src)')
  number = get_phone_data(doc)

  # Councillor e-mails are assembled by JavaScript on the page, so the
  # script source is handed to email_js() to recover the address.
  script_text = doc.xpath('string(//span/script)')
  mail = email_js(script_text)

  member = Legislator(name=member_name, post_id=ward, role='Councillor')
  for source in (COUNCIL_PAGE, url):
    member.add_source(source)
  member.add_contact('address', street, 'legislature')
  member.add_contact('voice', number, 'legislature')
  member.add_contact('email', mail, None)
  member.image = portrait

  return member

示例#22

0

显示文件

文件： people.py 项目： rhymeswithcycle/scrapers-ca

    def get_people(self):
        """Yield a Legislator for each first table row in the web part.

        The label in the first cell holds an optional title ("Deputy",
        "Warden", "Mayor") and the name; whatever is left after removing the
        name becomes the role, defaulting to "Councillor". The district is
        the part after the first comma of a paragraph in the first cell,
        with municipal prefixes/suffixes removed. The second cell is split
        into residence and municipal-office contacts for self.get_contacts().
        """
        root = lxmlize(COUNCIL_PAGE)

        for row in root.xpath('//div[@id="WebPartWPQ1"]/table/tbody/tr[1]'):
            # Prefer the deeply nested <strong>; fall back to any <strong>.
            label_nodes = row.xpath(".//td[1]//strong//strong//strong//strong")
            if not label_nodes:
                label_nodes = row.xpath(".//td[1]//strong")
            label = label_nodes[0].text_content()

            member_name = label.strip()
            for token in ("Deputy ", "Warden ", "Mayor"):
                member_name = member_name.replace(token, "")
            title = label.replace(member_name, "").strip() or "Councillor"
            if "," in member_name:
                member_name = member_name.split(",")[0].strip()

            place_line = row.xpath('.//td[1]//p[contains(text(),",")]/text()')[0]
            place = place_line.split(",")[1].strip()
            place = re.sub(r"\A(?:City|Municipality|Town|Township|Village) of\b| Township\Z", "", place)

            member = Legislator(name=member_name, post_id=place, role=title)
            member.add_source(COUNCIL_PAGE)

            member.image = row.xpath(".//td[1]//img/@src")[0]

            details = row.xpath(".//td[2]")[0].text_content()
            home_part = re.findall(r"(?<=Residence:)(.*)(?=Municipal Office:)", details, flags=re.DOTALL)[0]
            self.get_contacts(home_part, "residence", member)
            office_part = re.findall(r"(?<=Municipal Office:)(.*)", details, flags=re.DOTALL)[0]
            self.get_contacts(office_part, "legislature", member)

            yield member

示例#23

0

显示文件

  def get_people(self):
    """Yield the mayor, then a 'Conseiller' Legislator per councillor row.

    COUNCIL_PAGE links to a "Mayor" page, handled by self.scrape_mayor(),
    and to a "Councillors" page. On the latter, every <tr> that has an
    image in one of its cells (except the last such row) must consist of
    exactly two cells: an image cell and an info cell holding the name,
    district, e-mail and telephone number.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = lxmlize(councillors_url)

    councillor_rows = cpage.xpath('//tr[td//img]')[:-1]
    for councillor_row in councillor_rows:
      img_cell, info_cell = tuple(councillor_row)
      name = info_cell.xpath(
         'string(.//span[contains(text(), "Councillor")])')[len('Councillor '):]
      district = info_cell.xpath('string(.//p[contains(text(), "District")])')
      email = info_cell.xpath('string(.//a[contains(@href, "mailto:")])')
      if not email:
        # Some rows give the address as plain text after an "E-mail" label.
        email = info_cell.xpath('string(.//strong[contains(text(), "E-mail")]/following-sibling::text())')
      phone = info_cell.xpath(
          'string(.//p[contains(.//text(), "Telephone:")])').split(':')[1]
      # Read the image of THIS cell: the path must be relative (".//") and
      # the URL lives in the "src" attribute. The former '//img/@href'
      # searched the whole document for an attribute <img> does not carry,
      # so every image ended up being the councillors page URL.
      img_url_rel = img_cell.xpath('string(.//img/@src)')
      img_url = urljoin(councillors_url, img_url_rel)

      p = Legislator(name=name, post_id=district, role='Conseiller')
      p.add_source(COUNCIL_PAGE)
      p.add_source(councillors_url)
      p.add_contact('email', email, None)
      p.add_contact('voice', phone, 'legislature')
      p.image = img_url
      yield p

示例#24

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

def councillor_data(url):
    """Scrape a single councillor profile page into a Legislator.

    Args:
        url: Address of the councillor's profile page.

    Returns:
        A Legislator with role 'Councillor' carrying the page's name,
        district, address, phone, e-mail and photo, sourced from both
        COUNCIL_PAGE and ``url``.
    """
    profile = lxmlize(url)

    full_name = profile.xpath('string(//h1[@id="TitleOfPage"])')
    ward = profile.xpath('string(//h2)')
    office_address = profile.xpath('string(//div[@class="asideContent"])')
    portrait = profile.xpath('string(//div[@id="contentright"]//img[1]/@src)')
    telephone = get_phone_data(profile)

    # The page builds the e-mail address with an inline script, so the script
    # source is handed to email_js (presumably it reconstructs the address).
    script_source = profile.xpath('string(//span/script)')
    mail = email_js(script_source)

    legislator = Legislator(name=full_name, post_id=ward, role='Councillor')
    for source in (COUNCIL_PAGE, url):
        legislator.add_source(source)
    legislator.add_contact('address', office_address, 'legislature')
    legislator.add_contact('voice', telephone, 'legislature')
    legislator.add_contact('email', mail, None)
    legislator.image = portrait

    return legislator

示例#25

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_mayor(self):
        """Build the mayor's Legislator record from MAYOR_PAGE.

        Name, photo, phone, mailing address and e-mail are read from fixed
        positions in the article markup, so a layout change surfaces as an
        IndexError.

        Returns:
            The populated Legislator (post 'Summerside', role 'Mayor').
        """
        page = lxmlize(MAYOR_PAGE, 'iso-8859-1')

        heading = page.xpath('//div[@class="articletitle"]/h1')[0]
        name = heading.text_content().replace('Mayor', '')

        mayor = Legislator(name=name, post_id='Summerside', role='Mayor')
        mayor.add_source(MAYOR_PAGE)

        # The image path has every '..' removed before it is stored.
        image_src = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
        mayor.image = image_src.replace('..', '')

        paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
        phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
        # The address spans two consecutive paragraphs.
        first_line = paragraphs[3].text_content().replace('by mail: ', '')
        address = first_line + ' ' + paragraphs[4].text_content()
        mail_link = paragraphs[5].xpath('.//a[contains(@href, "mailto:")]')[0]
        email = mail_link.text_content()

        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('email', email, None)

        return mayor

示例#26

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for the mayor and each councillor on COUNCIL_PAGE.

    Every member receives the same general phone and fax numbers; an
    individual e-mail is added only when a mailto link follows the entry.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')

    shared = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
    general_phone, general_fax = shared[0], shared[1]

    for member in page.xpath('//tr/td/p/strong'):
      label = member.text_content()
      # Bold entries containing '@' are not member headings.
      if "@" in label:
        continue

      if 'Mayor' in label:
        name = label.replace('Mayor', '')
        district = 'Dollard-Des Ormeaux'
        role = 'Maire'
      else:
        # The heading holds a single district digit followed by the name.
        name = re.split(r'[0-9]', label)[1]
        district = 'District ' + re.findall(r'[0-9]', label)[0]
        role = 'Conseiller'

      person = Legislator(name=name, post_id=district, role=role)
      person.add_source(COUNCIL_PAGE)
      person.image = member.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]

      mail_links = member.xpath('./parent::p/following-sibling::p//a[contains(@href, "mailto:")]')
      if mail_links:
        person.add_contact('email', mail_links[0].text_content(), None)

      person.add_contact('voice', general_phone, 'legislature')
      person.add_contact('fax', general_fax, 'legislature')

      yield person

示例#27

0

显示文件

  def get_people(self):
      """Yield one Legislator per row of the CSV at COUNCIL_CSV_URL.

      The mayor is assigned the post 'Ottawa'; every other row uses its
      'District name' column after known spelling corrections are applied.
      """
      # Correct typos. The City has been notified of the errors.
      corrections = (
          (u'Knoxdale Merivale', u'Knoxdale-Merivale'),
          (u'Rideau Vanier', u'Rideau-Vanier'),
          (u'Orleans', u'Orléans'),
      )
      address_columns = ('Address line 1', 'Address line 2', 'Locality',
                         'Postal code', 'Province')

      rows = DictReader(urlopen(COUNCIL_CSV_URL))
      for row in rows:
          name = '%s %s' % (row['First name'], row['Last name'])
          role = row['Elected office']
          district = 'Ottawa' if role == 'Mayor' else row['District name']

          for wrong, right in corrections:
              if district == wrong:
                  district = right

          email = row['Email']
          address = ', '.join([row[column] for column in address_columns])
          phone = row['Phone']
          photo_url = row['Photo URL']

          person = Legislator(name=name, post_id=district, role=role)
          person.add_source(COUNCIL_CSV_URL)
          person.add_contact('email', email, None)
          person.add_contact('address', address, 'legislature')
          person.add_contact('voice', phone, 'legislature')
          person.image = photo_url
          yield person

示例#28

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_mayor(self, div):
        """Build the mayor's Legislator from the mayor's block on the council page.

        Args:
            div: Element holding the mayor's link, phone text and mailto link.

        Returns:
            A Legislator (post 'Guelph', role 'Mayor') with phone and e-mail
            from ``div`` and Facebook/Twitter links and photo from the linked
            profile page.
        """
        profile_link = div.xpath('.//a')[0]
        name = profile_link.text_content().replace('Mayor', '')
        url = profile_link.attrib['href']

        mayor = Legislator(name=name, post_id='Guelph', role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_source(url)

        # The phone number is the third non-blank text node of the block.
        phone = div.xpath('.//text()[normalize-space()]')[2]
        email = div.xpath('.//a[contains(@href,"mailto:")]')[0].text_content()

        profile = lxmlize(url)

        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('email', email, None)

        facebook = profile.xpath(
            '//div[@class="entry-content"]//a[contains(@href, "facebook")]')[0]
        mayor.add_link(facebook.attrib['href'], None)
        twitter = profile.xpath(
            '//div[@class="entry-content"]//a[contains(@href, "twitter")]')[0]
        mayor.add_link(twitter.attrib['href'], None)

        mayor.image = profile.xpath('//header/img/@src')[0]

        return mayor

示例#29

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_mayor(self, name, url):
        """Build the mayor's Legislator from the city page and the mayor's own site.

        Args:
            name: The mayor's name, used as given.
            url: The mayor's page on the council site (phone, fax, e-mail, photo).

        Returns:
            A Legislator (post 'Burlington', role 'Mayor'). The mailing address
            comes from the contact page linked from http://www.burlingtonmayor.com.
        """
        city_page = lxmlize(url)

        sidebar_text = city_page.xpath(
            '//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
        phone = sidebar_text[0]
        fax = sidebar_text[1]
        email = city_page.xpath(
            '//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()'
        )[0]

        # Follow the "Contact" menu entry of the mayor's site to find the address.
        home = lxmlize('http://www.burlingtonmayor.com')
        contact_link = home.xpath(
            '//div[@class="menu"]//a[contains(text(),"Contact")]')[0]
        contact_page = lxmlize(contact_link.attrib['href'])
        address_node = contact_page.xpath(
            '//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0]
        address = address_node.text_content()

        mayor = Legislator(name=name, post_id="Burlington", role='Mayor')
        for source in (COUNCIL_PAGE, url, 'http://www.burlingtonmayor.com'):
            mayor.add_source(source)

        mayor.image = city_page.xpath(
            '//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('fax', fax, 'legislature')
        mayor.add_contact('email', email, None)
        mayor.add_contact('address', address, 'legislature')

        return mayor

示例#30

0

显示文件

    def get_people(self):
        """Yield the mayor and then one Legislator per council district.

        The member list is rendered client-side, so districts, member names
        and profile links are pulled out of the page's inline script with
        regular expressions. The three lists are positionally aligned and
        their first entry is the mayor.
        """
        page = lxmlize(COUNCIL_PAGE)

        # it's all javascript rendered on the client... wow.
        js = page.xpath(
            'string(//div[@class="inner_container"]/div/script[2])')
        districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
        members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
        urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
        # first item in list is mayor
        p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
        p.add_source(COUNCIL_PAGE)
        # NOTE(review): the fetched page is never read; the fetch is kept so
        # that failure behaviour is unchanged.
        mayor_page = lxmlize(MAYOR_CONTACT_PAGE)
        p.add_source(MAYOR_CONTACT_PAGE)
        # NOTE(review): hardcoded, and the value looks masked - confirm the
        # real address before relying on it.
        email = '*****@*****.**'  # hardcoded
        p.add_contact('email', email, None)
        yield p

        # zip() returns an iterator on Python 3 and cannot be sliced directly;
        # materialise it first so the mayor's entry can be skipped.
        councillors = list(zip(districts, members, urls))[1:]
        for district, member, url in councillors:
            profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
            profile_page = lxmlize(profile_url)
            photo_url = profile_page.xpath('string(//img/@src)')
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            post_id = 'District ' + re.search(r'\d+', district).group(0)
            # Strip the leading "mailto:" from the first mailto link's href.
            email = profile_page.xpath(
                'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
            p = Legislator(name=member, post_id=post_id, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(profile_url)
            p.image = photo_url
            p.add_contact('email', email, None)
            yield p

示例#31

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor and every non-vacant councillor from the navigation menu.

    The first menu link is handed to ``self.scrape_mayor``. Later links are
    read as "<district> - <name>"; the first link without a '-' ends the
    scrape and entries named 'Vacant' are skipped.
    """
    page = lxmlize(COUNCIL_PAGE)

    links = page.xpath('//div[@id="navMultilevel"]//a')
    for position, link in enumerate(links):
      if position == 0:
        yield self.scrape_mayor(link)
        continue

      label = link.text_content()
      if '-' not in label:
        break

      district, name = label.split(' - ')
      if name == 'Vacant':
        continue

      profile = lxmlize(link.attrib['href'])

      address = profile.xpath('//div[@class="column last"]//p')[0].text_content()
      # Phone and fax: keep everything from the first digit, turn ') ' into '-'.
      tel_text = profile.xpath('//article[@id="primary"]//*[contains(text(),"Tel")]')[0].text_content()
      phone = re.findall(r'([0-9].*)', tel_text)[0].replace(') ', '-')
      fax_text = profile.xpath('//article[@id="primary"]//*[contains(text(),"Fax")]')[0].text_content()
      fax = re.findall(r'([0-9].*)', fax_text)[0].replace(') ', '-')
      email = profile.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

      person = Legislator(name=name, post_id=district, role='Councillor')
      person.add_source(COUNCIL_PAGE)
      person.add_source(link.attrib['href'])
      person.add_contact('address', address, 'legislature')
      person.add_contact('voice', phone, 'legislature')
      person.add_contact('fax', fax, 'legislature')
      person.add_contact('email', email, None)
      # The second image inside the article is used as the portrait.
      person.image = profile.xpath('//article[@id="primary"]//img/@src')[1]
      yield person

示例#32

0

显示文件

    def get_people(self):
        """Yield a Legislator for every member linked from COUNCIL_PAGE.

        A link whose text contains "Ward N" is a councillor for that ward;
        any other link is treated as the mayor of Kawartha Lakes. E-mail
        and photo come from the member's own page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//p[@class="WSIndent"]/a'):
            label = link.text_content()
            wards = re.findall(r'(Ward [0-9]{1,2})', label)
            if wards:
                district = wards[0]
                name = label.replace(district, '').strip()
                role = 'Councillor'
            else:
                district = 'Kawartha Lakes'
                name = label.replace('Mayor', '').strip()
                role = 'Mayor'

            url = link.attrib['href']
            profile = lxmlize(url)
            # Keep only what follows the last ':' of the mailto href.
            mailto = profile.xpath('//a[contains(@href, "mailto:")]/@href')[0]
            email = mailto.rsplit(':', 1)[1].strip()
            image = profile.xpath('//img[@class="image-right"]/@src')[0]

            person = Legislator(name=name, post_id=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.add_contact('email', email, None)
            person.image = image
            yield person

示例#33

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor and then one Legislator per council district.

    The member list is rendered client-side, so districts, member names
    and profile links are pulled out of the page's inline script with
    regular expressions. The three lists are positionally aligned and
    their first entry is the mayor.
    """
    page = lxmlize(COUNCIL_PAGE)

    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # NOTE(review): the fetched page is never read; the fetch is kept so
    # that failure behaviour is unchanged.
    mayor_page = lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    # NOTE(review): hardcoded, and the value looks masked - confirm the
    # real address before relying on it.
    email = '*****@*****.**' # hardcoded
    p.add_contact('email', email, None)
    yield p

    # zip() returns an iterator on Python 3 and cannot be sliced directly;
    # materialise it first so the mayor's entry can be skipped.
    councillors = list(zip(districts, members, urls))[1:]
    for district, member, url in councillors:
      profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
      profile_page = lxmlize(profile_url)
      photo_url = profile_page.xpath('string(//img/@src)')
      # Raw string: '\d' in a plain literal is an invalid escape sequence.
      post_id = 'District ' + re.search(r'\d+', district).group(0)
      # Strip the leading "mailto:" from the first mailto link's href.
      email = profile_page.xpath(
          'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
      p = Legislator(name=member, post_id=post_id, role='Conseiller')
      p.add_source(COUNCIL_PAGE)
      p.add_source(profile_url)
      p.image = photo_url
      p.add_contact('email', email, None)
      yield p

示例#34

0

显示文件

    def get_people(self):
        """Yield every regional councillor, then the regional chair.

        Councillors are grouped under <h3> headings on COUNCIL_PAGE (the
        first heading is skipped); each one's contact details come from
        ``councillor_data``. The chair is scraped from CHAIR_URL.
        """
        page = lxmlize(COUNCIL_PAGE)

        for heading in page.xpath('//*[@id="contentIntleft"]//h3')[1:]:
            # First child of each <p> in the unbroken run of <p> siblings
            # that directly follows the heading.
            paragraphs = takewhile(lambda node: node.tag == 'p',
                                   heading.xpath('./following-sibling::*'))
            links = [paragraph[0] for paragraph in paragraphs]
            for link in links:
                # The post is whatever follows "of " in the heading text.
                post = re.search('of (.*)', heading.text).group(1)
                member = Legislator(name=link.text, post_id=post,
                                    role='Councillor')
                member.add_source(COUNCIL_PAGE)
                profile_url = link.attrib['href']
                member.add_source(profile_url)
                email, phone, address, photo_url = councillor_data(profile_url)
                member.add_contact('email', email, None)
                member.add_contact('voice', phone, 'legislature')
                member.add_contact('address', address, 'legislature')
                member.image = photo_url
                yield member

        chair_page = lxmlize(CHAIR_URL)
        title = chair_page.xpath('string(//title)')
        name = re.search('Chair (.*) -', title).group(1)
        email = chair_page.xpath(
            'string(//a[contains(text(), "E-mail")]/@href)')
        phone_text = chair_page.xpath(
            'string((//span[@class="labelTag"][contains(text(), "Phone")]/parent::*/text())[1])'
        )
        phone = phone_text.strip(':')
        address_lines = chair_page.xpath(
            '//div[@class="contactBody"]//p[1]/text()')
        address = '\n'.join(address_lines)
        photo_src = chair_page.xpath(
            'string(//div[@id="contentIntleft"]//img[1]/@src)')
        photo_url = urljoin(CHAIR_URL, photo_src)

        chair = Legislator(name=name, post_id='Waterloo', role='Regional Chair')
        chair.add_source(CHAIR_URL)
        chair.add_contact('email', email, None)
        chair.add_contact('voice', phone, 'legislature')
        chair.add_contact('address', address, 'legislature')
        chair.image = photo_url
        yield chair

示例#35

0

显示文件

    def get_people(self):
        """Yield every ward councillor, then the mayor.

        Councillors are the "<name> - Ward N" links on COUNCIL_PAGE, each
        with its own profile page. The mayor is scraped from MAYOR_PAGE.
        """
        listing = lxmlize(COUNCIL_PAGE)

        ward_links = listing.xpath(
            '//span[@class="textimagetype"]//a[contains(text(), "- Ward")]')
        for link in ward_links:
            name, district = link.text.split(' - ')
            profile_url = link.attrib['href']
            profile = lxmlize(profile_url)

            councillor = Legislator(name=name, post_id=district,
                                    role='Councillor')
            councillor.add_source(COUNCIL_PAGE)
            councillor.add_source(profile_url)

            email = profile.xpath('string(//a[contains(@href, "@")])')
            councillor.add_contact('email', email, None)

            # Text after the first ':' of the first text node mentioning "Phone".
            phone_line = profile.xpath('string(//text()[contains(., "Phone")])')
            councillor.add_contact('voice', phone_line.split(':')[1],
                                   'legislature')

            photo_src = profile.xpath('string((//span/img)[1]/@src)')
            councillor.image = urljoin(profile_url, photo_src)

            yield councillor

        mayor_page = lxmlize(MAYOR_PAGE)
        # The mayor's name is the first two words of the "is married to" paragraph.
        bio = mayor_page.xpath('//p[contains(text(), "is married to")]/text()')[0]
        name = ' '.join(bio.split()[:2])
        office_lines = mayor_page.xpath(
            '//p[contains(text(), "Mayor\'s Office")]/text()')[1:]
        address = ' '.join(office_lines)
        phone, fax = mayor_page.xpath(
            '//p[contains(text(), "Phone:")]/text()')[:-1]
        phone = phone.strip().replace('(', '').replace(') ', '-')
        fax = fax.strip().replace('(', '').replace(') ', '-').split(':')[1]
        email = mayor_page.xpath('//a[contains(@href, "mailto:")]/text()')[0]

        mayor = Legislator(name=name, post_id='Windsor', role='Mayor')
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('voice', phone, 'legislature')
        mayor.add_contact('fax', fax, 'legislature')
        mayor.add_contact('email', email, None)
        mayor.image = mayor_page.xpath(
            '//div[@class="sectioning"]//img[contains(@title, "Mayor")]/@src'
        )[0]
        yield mayor

示例#36

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield the mayor, then every councillor, from the title spans.

        The first ``span.Title`` is the mayor. Each later span is read as
        "<name> - <ward ...>" (an en dash counts as a hyphen); spans that
        do not split into exactly two parts are skipped.
        """
        root = lxmlize(COUNCIL_PAGE)
        titles = root.xpath('//span[@class="Title"]')

        mayor_span = titles[0]
        mayor_text = ' '.join(mayor_span.xpath('.//text()'))
        # Everything before the first '(' is the mayor's name.
        mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
        mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
        mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

        mayor = Legislator(name=mayor_name,
                           post_id='Charlottetown',
                           role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_contact('email', mayor_email, None)
        mayor.image = mayor_photo

        yield mayor

        for span in titles[1:]:
            text = ' '.join(span.xpath('.//text()'))
            parts = text.replace(u'\u2013', '-').split('-')
            if len(parts) != 2:
                continue
            raw_name, raw_district = parts

            # Drop the title and any parenthesised note, then collapse whitespace.
            name = raw_name.strip().replace('Councillor', '')
            name = re.sub(r'\(.+?\)', '', name)
            name = ' '.join(name.split())

            # The post id is the first two words after the dash.
            district_id = ' '.join(raw_district.split()[:2])

            # needed a wacky xpath to deal with ward 8
            photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
            photo_url = urljoin(COUNCIL_PAGE, photo[0])

            email = span.xpath('string(following::a[1]/text())')

            councillor = Legislator(name=name, post_id=district_id,
                                    role='Councillor')
            councillor.add_source(COUNCIL_PAGE)
            if email:
                councillor.add_contact('email', email, None)
            councillor.image = photo_url

            yield councillor

示例#37

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield the mayor, then every councillor, from the council table.

        The first table row is the mayor. From the third row up to (but not
        including) the last, rows come in pairs: a ward-name row followed by
        a row with one cell per councillor.
        """
        def email_of(node):
            # href of the first link whose text contains '@', minus "mailto:".
            href = node.xpath('string(.//a[contains(., "@")]/@href)')
            return re.match('(?:mailto:)?(.*)', href).group(1)

        page = lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@id="litcontentDiv"]//tr')
        council_rows = rows[2:-1]

        mayor_row = rows[0]
        mayor_photo = urljoin(COUNCIL_PAGE,
                              mayor_row.xpath('string(.//img/@src)'))
        mayor_cell = mayor_row.xpath('./td')[1]
        mayor_name = mayor_cell.xpath('string(.//strong)')
        mayor_email = email_of(mayor_cell)

        mayor = Legislator(name=mayor_name, post_id='Sault Ste. Marie',
                           role='Mayor')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_contact('email', mayor_email, None)
        mayor.image = mayor_photo
        yield mayor

        # Rows alternate between a ward name and that ward's councillors, so
        # consume them two at a time from one shared iterator.
        row_iter = iter(council_rows)
        for ward_row, data_row in zip(row_iter, row_iter):
            ward_label = ward_row.xpath('string(.//text()[contains(., "Ward")])')
            district_num = district_name_using_number(ward_label)
            for cell in data_row.xpath('./td'):
                name = cell.xpath('string(.//strong)')
                if not name:
                    # Bad markup: the name sits after an empty <strong>.
                    name = cell.xpath(
                        'string(.//strong/following-sibling::text())')
                email = email_of(cell)
                photo_url = urljoin(COUNCIL_PAGE,
                                    cell.xpath('string(.//img/@src)'))
                # address and phone are brittle, inconsistent

                councillor = Legislator(name=name, post_id=district_num,
                                        role='Councillor')
                councillor.add_source(COUNCIL_PAGE)
                if email:
                    councillor.add_contact('email', email, None)
                councillor.image = photo_url

                yield councillor

示例#38

0

显示文件

    def get_people(self):
        """Yield a Legislator for every profile linked from COUNCIL_PAGE.

        Each profile page is parsed line by line: the first line is the
        member's name, and the second decides the post and role (a line
        ending in ' Ward', 'At Large', or anything else for the mayor).
        """
        page = lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//a[contains(@title, "Profile")][1]/@href')
        for councillor in councillors:
            # `councillor` is the profile URL; `page` is rebound to that profile.
            page = lxmlize(councillor)
            info = page.xpath('//table/tbody/tr/td[2]')[0]

            # Give every matched <br> a newline tail so text_content() can be
            # split into lines.
            for br in info.xpath('*//br'):
                br.tail = '\n' + br.tail if br.tail else '\n'
            # Non-blank, stripped lines of the info cell.
            lines = [
                line.strip() for line in info.text_content().split('\n')
                if line.strip()
            ]
            text = '\n'.join(lines)  # NOTE(review): not used below
            name = lines[0].replace('Councillor ', '').replace('Mayor ', '')

            if lines[1].endswith(' Ward'):
                district = lines[1].replace(' Ward', '')
                role = 'Councillor'
            elif lines[1] == 'At Large':
                district = 'Thunder Bay'
                role = 'Councillor'
            else:
                # Neither a ward nor at-large: treated as the mayor.
                district = 'Thunder Bay'
                role = 'Mayor'
            # Second pass removes titles that had no trailing space.
            name = name.replace('Councillor',
                                '').replace('At Large',
                                            '').replace('Mayor', '').strip()

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillor)

            p.image = page.xpath('//td[@valign="top"]/img/@src')[0]

            # The address is the first two direct text nodes of the cell's
            # <p> children, with runs of whitespace collapsed.
            address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
            address = re.sub(r'\s{2,}', ' ', address)

            p.add_contact('address', address, 'legislature')

            # Each text node of the second <p> is expected to be "<label>:<value>";
            # a node with zero or several ':' makes the unpacking raise.
            contacts = info.xpath('./p[2]/text()')
            for contact in contacts:
                contact_type, contact = contact.split(':')
                contact = contact.replace('(1st)', '').replace('(2nd)',
                                                               '').strip()
                if 'Fax' in contact_type:
                    p.add_contact('fax', contact, 'legislature')
                elif 'Email' in contact_type:
                    # Stop here: the e-mail is read from the mailto link below.
                    break
                else:
                    # Any other label is stored as a voice number, with the
                    # label text passed as the third argument.
                    p.add_contact('voice', contact, contact_type)

            email = info.xpath(
                './/a[contains(@href, "mailto:")]')[0].text_content()
            p.add_contact('email', email, None)

            yield p

示例#39

0

显示文件

    def get_people(self):
        """Yield the mayor, then one Legislator per councillor cell.

        Council members are table cells inside the "printArea" div; the
        first selected cell is handed to ``self.scrape_mayor``. Each
        councillor's address, numbers and e-mail come from the profile
        page linked from the cell.
        """
        page = lxmlize(COUNCIL_PAGE)

        # Skip the first four cells and the last one.
        councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            # Name: text of the last link inside <strong>, whitespace collapsed.
            name = ' '.join(
                councillor.xpath('string(.//strong/a[last()])').split())
            infostr = councillor.xpath('string(.//strong)')
            try:
                # Ward councillors have a "-" followed by the ward.
                district = infostr.split('-')[1]
                role = 'Councillor'
            except IndexError:
                # No "-" in the heading: treated as the regional councillor.
                district = 'Newmarket'
                role = 'Regional Councillor'
            url = councillor.xpath('.//a/@href')[0]

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = councillor.xpath('.//img/@src')[0]

            # `page` is rebound to the councillor's profile page.
            page = lxmlize(url)
            info = page.xpath('//div[@id="printArea"]')[0]
            # Paragraphs that follow the second "heading" paragraph; they are
            # consumed from the front, so the order of the pops below matters.
            info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
            address = info.pop(0).text_content().strip()
            if not address:
                # First paragraph was empty: the address is in the next one.
                address = info.pop(0).text_content().strip()
            # NOTE(review): `address` is never added to `p` in this block.

            if 'Ward' in info[0].text_content():
                info.pop(0)

            # Splitting on ':' leaves each value paired with the label that
            # ends the previous piece.
            numbers = info.pop(0).text_content().split(':')
            email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact('email', email, None)
            for i, contact in enumerate(numbers):
                if i == 0:
                    # The first piece holds only the first label.
                    continue
                if '@' in contact:
                    continue  # executive assistant email
                else:
                    number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})',
                                        contact)[0]
                    ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                    if ext:
                        number = number + ext[0].replace('Ext. ', ' x')
                    # The label is the trailing word of the previous piece.
                    contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    # Unknown label: used as both contact type and note.
                    p.add_contact(contact_type, number, contact_type)
            site = page.xpath('.//a[contains(text(), "http://")]')
            if site:
                p.add_link(site[0].text_content(), None)
            yield p

示例#40

0

显示文件

    def get_people(self):
        """Yield a Legislator for the mayor and each councillor on COUNCIL_PAGE.

        Members are the ``h1.title`` headings of the form "<name>, <post>";
        headings without a comma are ignored. The mayor's phone and e-mail
        script live in sibling <div>s, a councillor's in sibling <p>s, and
        only the councillor's phone is optional.
        """
        page = lxmlize(COUNCIL_PAGE)

        for heading in page.xpath('//h1[@class="title"]'):
            title = heading.text_content()
            if ',' not in title:
                continue
            name, district = title.split(',')
            name = name.strip()

            if 'Mayor' in district:
                person = Legislator(name=name, post_id='Beaconsfield',
                                    role='Maire')
                person.add_source(COUNCIL_PAGE)
                person.image = heading.xpath(
                    './parent::div/parent::div/p//img/@src')[0]
                phone_text = heading.xpath(
                    './/parent::div/following-sibling::div[contains(text(), "514")]/text()'
                )[0]
                # Keep what follows the label and hyphenate the number.
                phone = phone_text.split(':')[1].strip().replace(' ', '-')
                person.add_contact('voice', phone, 'legislature')
                script_node = heading.xpath(
                    './/parent::div/following-sibling::div/script')[0]
                person.add_contact('email',
                                   get_email(script_node.text_content()), None)
            else:
                # The post id is the part of the heading after the '-'.
                district = district.split('-')[1].strip()
                person = Legislator(name=name, post_id=district,
                                    role='Conseiller')
                person.add_source(COUNCIL_PAGE)

                person.image = heading.xpath(
                    './parent::div/parent::div/p//img/@src')[0]

                phone_nodes = heading.xpath(
                    './/parent::div/following-sibling::p[contains(text(), "514")]/text()'
                )
                if phone_nodes:
                    phone = phone_nodes[0].split(':')[1].strip().replace(' ', '-')
                    person.add_contact('voice', phone, 'legislature')
                script_node = heading.xpath(
                    './/parent::div/following-sibling::p/script')[0]
                person.add_contact('email',
                                   get_email(script_node.text_content()), None)

            yield person

示例#41

0

显示文件

    def get_people(self):
        """Yield every councillor from the council table, then the mayor.

        Councillor rows (the table's header row is skipped) supply name,
        district, address, phone and fax; the photo comes from the linked
        profile page. The mayor is scraped from MAYOR_PAGE without an e-mail.
        """
        listing = lxmlize(COUNCIL_PAGE)

        rows = listing.xpath('//table[@class="table_style"]/tbody/tr')[1:]
        for row in rows:
            name = row.xpath('.//a')[0].text_content()
            district = 'District %s' % row.xpath('.//strong')[0].text_content()

            address = row.xpath('.//td')[3].text_content().replace("\r\n", ', ')
            # The fifth cell lists phone then fax, each as "<label>:<number>".
            numbers = row.xpath('.//td[5]/p/text()')
            phone = numbers[0].split(':')[1].replace("(", '').replace(") ", '-')
            fax = numbers[1].split(':')[1].replace("(", '').replace(") ", '-')

            councillor = Legislator(name=name, post_id=district,
                                    role='Councillor')
            councillor.add_source(COUNCIL_PAGE)
            councillor.add_contact('address', address, 'legislature')
            councillor.add_contact('voice', phone, 'legislature')
            councillor.add_contact('fax', fax, 'legislature')

            profile_url = row.xpath('.//a/@href')[0]
            councillor.add_source(profile_url)
            profile = lxmlize(profile_url)
            councillor.image = profile.xpath('//img[@class="image_left"]/@src')[0]
            yield councillor

        mayor_page = lxmlize(MAYOR_PAGE)
        about = mayor_page.xpath('//strong[contains(text(), "About")]')[0]
        name = re.search('About Mayor (.+):', about.text).group(1)
        photo_url = mayor_page.xpath('string(//span/img/@src)')
        # Paragraph right after the one holding the "Contact" heading; its
        # first child carries the address and its third the phone line.
        contact_block = mayor_page.xpath(
            '//strong[contains(text(), "Contact")]/ancestor::p/'
            'following-sibling::p[1]')[0]
        address = contact_block[0].text_content()
        phone = contact_block[2].text.split(':')[1]

        mayor = Legislator(name=name, post_id='Cape Breton', role='Mayor')
        mayor.add_source(MAYOR_PAGE)
        mayor.add_contact('address', address, 'legislature')
        mayor.add_contact('voice', phone, 'legislature')
        # email is protected through JS
        mayor.image = photo_url
        yield mayor

示例#42

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then every councillor, from the title spans.

    The first ``span.Title`` is the mayor. Each later span is read as
    "<name> - <ward ...>" (an en dash counts as a hyphen); spans that do
    not split into exactly two parts are skipped.
    """
    root = lxmlize(COUNCIL_PAGE)
    titles = root.xpath('//span[@class="Title"]')

    mayor_span = titles[0]
    mayor_text = ' '.join(mayor_span.xpath('.//text()'))
    # Everything before the first '(' is the mayor's name.
    mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
    mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
    mayor_email = mayor_span.xpath('following::a[1]/text()')[0]

    mayor = Legislator(name=mayor_name, post_id='Charlottetown', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_contact('email', mayor_email, None)
    mayor.image = mayor_photo

    yield mayor

    for span in titles[1:]:
      text = ' '.join(span.xpath('.//text()'))
      parts = text.replace(u'\u2013', '-').split('-')
      if len(parts) != 2:
        continue
      raw_name, raw_district = parts

      # Drop the title and any parenthesised note, then collapse whitespace.
      name = raw_name.strip().replace('Councillor', '')
      name = re.sub(r'\(.+?\)', '', name)
      name = ' '.join(name.split())

      # The post id is the first two words after the dash.
      district_id = ' '.join(raw_district.split()[:2])

      # needed a wacky xpath to deal with ward 8
      photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
      photo_url = urljoin(COUNCIL_PAGE, photo[0])

      email = span.xpath('string(following::a[1]/text())')

      councillor = Legislator(name=name, post_id=district_id, role='Councillor')
      councillor.add_source(COUNCIL_PAGE)
      if email:
        councillor.add_contact('email', email, None)
      councillor.image = photo_url

      yield councillor

示例#43

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then one Legislator per councillor cell.

    Council members are table cells inside the "printArea" div; the first
    selected cell is handed to ``self.scrape_mayor``. Each councillor's
    address, numbers and e-mail come from the profile page linked from
    the cell.
    """
    page = lxmlize(COUNCIL_PAGE)

    # Skip the first four cells and the last one.
    councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
      # Name: text of the last link inside <strong>, whitespace collapsed.
      name = ' '.join(councillor.xpath('string(.//strong/a[last()])').split())
      infostr = councillor.xpath('string(.//strong)')
      try:
        # Ward councillors have a "-" followed by the ward.
        district = infostr.split('-')[1]
        role = 'Councillor'
      except IndexError:
        # No "-" in the heading: treated as the regional councillor.
        district = 'Newmarket'
        role = 'Regional Councillor'
      url = councillor.xpath('.//a/@href')[0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = councillor.xpath('.//img/@src')[0]

      # `page` is rebound to the councillor's profile page.
      page = lxmlize(url)
      info = page.xpath('//div[@id="printArea"]')[0]
      # Paragraphs that follow the second "heading" paragraph; they are
      # consumed from the front, so the order of the pops below matters.
      info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
      address = info.pop(0).text_content().strip()
      if not address:
        # First paragraph was empty: the address is in the next one.
        address = info.pop(0).text_content().strip()
      # NOTE(review): `address` is never added to `p` in this block.

      if 'Ward' in info[0].text_content():
        info.pop(0)

      # Splitting on ':' leaves each value paired with the label that ends
      # the previous piece.
      numbers = info.pop(0).text_content().split(':')
      email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
      p.add_contact('email', email, None)
      for i, contact in enumerate(numbers):
        if i == 0:
          # The first piece holds only the first label.
          continue
        if '@' in contact:
          continue  # executive assistant email
        else:
          number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
          ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
          if ext:
            number = number + ext[0].replace('Ext. ', ' x')
          # The label is the trailing word of the previous piece.
          contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
        if 'Fax' in contact_type:
          p.add_contact('fax', number, 'legislature')
        elif 'Phone' in contact_type:
          p.add_contact('voice', number, 'legislature')
        else:
          # Unknown label: used as both contact type and note.
          p.add_contact(contact_type, number, contact_type)
      site = page.xpath('.//a[contains(text(), "http://")]')
      if site:
        p.add_link(site[0].text_content(), None)
      yield p

示例#44

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each ward councillor, then one for the mayor.

    Councillors are found through the "- Ward" links on COUNCIL_PAGE; each
    linked page is fetched for the email, phone number and photo. The mayor
    is read from MAYOR_PAGE.

    Phone and fax values are stored without their "Phone:"/"Fax:" label and
    without surrounding whitespace.
    """

    def _number(text):
      # Drop the leading "Label:" part and rewrite "(NNN) NNN-NNNN" as
      # "NNN-NNN-NNNN"; the final strip removes the space after the colon.
      return text.split(':')[1].replace('(', '').replace(') ', '-').strip()

    page = lxmlize(COUNCIL_PAGE)

    councillor_links = page.xpath(
        '//span[@class="textimagetype"]//a[contains(text(), "- Ward")]')
    for councillor_link in councillor_links:
      name, district = councillor_link.text.split(' - ')
      cpage_url = councillor_link.attrib['href']
      cpage = lxmlize(cpage_url)
      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      p.add_source(cpage_url)

      email = cpage.xpath('string(//a[contains(@href, "@")])')
      p.add_contact('email', email, None)

      # Keep only the value after "Phone:", trimmed of the whitespace that
      # follows the colon.
      phone = cpage.xpath(
          'string(//text()[contains(., "Phone")])').split(':')[1].strip()
      p.add_contact('voice', phone, 'legislature')

      img_url_rel = cpage.xpath(
          'string((//span/img)[1]/@src)')
      img_url = urljoin(cpage_url, img_url_rel)
      p.image = img_url

      yield p

    page = lxmlize(MAYOR_PAGE)
    # The mayor's name is taken as the first two words of the biography
    # paragraph that contains "is married to".
    name = ' '.join(page.xpath('//p[contains(text(), "is married to")]/text()')[0].split()[:2])
    address = ' '.join(page.xpath('//p[contains(text(), "Mayor\'s Office")]/text()')[1:])
    # The XPath predicate guarantees the first text node starts the "Phone:"
    # line, so both values carry a label that must be removed.
    phone, fax = page.xpath('//p[contains(text(), "Phone:")]/text()')[:-1]
    phone = _number(phone)
    fax = _number(fax)
    email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]

    p = Legislator(name=name, post_id='Windsor', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email, None)
    p.image = page.xpath('//div[@class="sectioning"]//img[contains(@title, "Mayor")]/@src')[0]
    yield p

示例#45

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor followed by every ward councillor on COUNCIL_PAGE.

    The first table row describes the mayor. Rows from the third up to (but
    excluding) the last alternate between a ward heading row and a row whose
    cells each describe one councillor of that ward.
    """
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="litcontentDiv"]//tr')
    ward_rows = rows[2:-1]

    mayor_row = rows[0]

    mayor_photo_rel = mayor_row.xpath('string(.//img/@src)')
    mayor_photo = urljoin(COUNCIL_PAGE, mayor_photo_rel)
    mayor_cell = mayor_row.xpath('./td')[1]
    mayor_name = mayor_cell.xpath('string(.//strong)')
    mayor_href = mayor_cell.xpath('string(.//a[contains(., "@")]/@href)')
    # Strip an optional "mailto:" scheme from the link target.
    mayor_email = re.match('(?:mailto:)?(.*)', mayor_href).group(1)

    mayor = Legislator(name=mayor_name, post_id='Sault Ste. Marie', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_contact('email', mayor_email, None)
    mayor.image = mayor_photo
    yield mayor

    # Pair each ward heading row with the councillor row that follows it;
    # a trailing unpaired row is ignored.
    for ward_row, data_row in zip(ward_rows[0::2], ward_rows[1::2]):
      ward_label = ward_row.xpath('string(.//text()[contains(., "Ward")])')
      district_num = district_name_using_number(ward_label)
      for cell in data_row.xpath('./td'):
        name = cell.xpath('string(.//strong)')
        if not name:
          # Bad markup: fall back to the text that follows the <strong>.
          name = cell.xpath('string(.//strong/following-sibling::'
                            'text())')
        href = cell.xpath('string(.//a[contains(., "@")]/@href)')
        email = re.match('(?:mailto:)?(.*)', href).group(1)
        photo_rel = cell.xpath('string(.//img/@src)')
        photo = urljoin(COUNCIL_PAGE, photo_rel)
        # Address and phone are not scraped: the markup is too inconsistent.

        p = Legislator(name=name, post_id=district_num, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
          p.add_contact('email', email, None)
        p.image = photo

        yield p

示例#46

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield the mayor, then a Legislator for each councillor entry.

        The first "two_third last" element on COUNCIL_PAGE is handed to
        scrape_mayor. Every other entry provides the name, district, phone,
        email and website, and its linked page provides the photo, address
        and blog/Facebook/Twitter links.
        """
        listing = lxmlize(COUNCIL_PAGE)

        entries = listing.xpath('//*[@class="two_third last"]')
        for position, entry in enumerate(entries):
            if position == 0:
                yield self.scrape_mayor(entry)
                continue

            first_link = entry.xpath('.//a')[0]
            name = first_link.text_content()
            name = name.replace('Councillor', '').replace('Mayor', '')
            # Non-blank text nodes in document order; the third is used as
            # the district and the fourth as the phone number.
            info = entry.xpath('.//text()[normalize-space()]')
            district = info[2]
            url = first_link.attrib['href']

            p = Legislator(name=name, post_id=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            voice = info[3].replace('extension', 'x')
            p.add_contact('voice', voice, 'legislature')

            mail_links = entry.xpath('.//a[contains(@href,"mailto:")]')
            if mail_links:
                p.add_contact('email', mail_links[0].text_content(), None)

            site_links = entry.xpath('.//a[contains(text(),"Website")]')
            if site_links:
                p.add_link(site_links[0].attrib['href'], None)

            profile = lxmlize(url)

            p.image = profile.xpath('//header/img/@src')[0]

            entry_text = profile.xpath(
                '//div[@class="entry-content"]')[0].text_content()
            address = re.findall(r'Address: (.*)Phone', entry_text)
            if address:
                p.add_contact('address', address[0], 'legislature')

            # Optional links, added in the order blog, Facebook, Twitter.
            blog = profile.xpath('//a[contains(text(),"Blog")]')
            if blog:
                p.add_link(blog[0].attrib['href'], None)

            for site in ('facebook', 'twitter'):
                found = profile.xpath(
                    '//div[@class="entry-content"]//a[contains(@href, "%s")]'
                    % site)
                if found:
                    p.add_link(found[0].attrib['href'], None)
            yield p

示例#47

0

显示文件

def mayor_data(url):
  """Return the Legislator record for the Waterloo mayor found at url."""
  mayor_page = lxmlize(url)

  # The <h1> text begins with the word "Mayor"; the first six characters
  # are sliced off to leave just the name.
  heading = mayor_page.xpath('string(//h1)')
  mayor = Legislator(name=heading[6:], post_id='Waterloo', role='Mayor')
  for source in (COUNCIL_PAGE, url):
    mayor.add_source(source)
  mayor.image = photo_url(mayor_page)

  return mayor

示例#48

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator per councillor table row, then one for the mayor.

    Councillor rows come from the "table_style" table on COUNCIL_PAGE (its
    first row is skipped); each councillor's own page is fetched for the
    photo. The mayor is read from MAYOR_PAGE.
    """
    listing = lxmlize(COUNCIL_PAGE)

    rows = listing.xpath('//table[@class="table_style"]/tbody/tr')[1:]
    for row in rows:
      name = row.xpath('.//a')[0].text_content()
      district = 'District %s' % row.xpath('.//strong')[0].text_content()

      address = row.xpath('.//td')[3].text_content().replace("\r\n", ', ')
      # The fifth cell lists the phone line first and the fax line second;
      # each is "Label: (NNN) NNN-NNNN" and is rewritten to "NNN-NNN-NNNN".
      contact_lines = row.xpath('.//td[5]/p/text()')
      phone = contact_lines[0].split(':')[1]
      phone = phone.replace("(", '').replace(") ", '-')
      fax = contact_lines[1].split(':')[1]
      fax = fax.replace("(", '').replace(") ", '-')

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      for kind, value in (('address', address), ('voice', phone), ('fax', fax)):
        p.add_contact(kind, value, 'legislature')

      councillor_url = row.xpath('.//a/@href')[0]
      p.add_source(councillor_url)
      profile = lxmlize(councillor_url)
      p.image = profile.xpath('//img[@class="image_left"]/@src')[0]
      yield p

    mayorpage = lxmlize(MAYOR_PAGE)
    about_heading = mayorpage.xpath('//strong[contains(text(), "About")]')[0]
    mayor_name = re.search('About Mayor (.+):', about_heading.text).group(1)
    mayor_photo = mayorpage.xpath('string(//span/img/@src)')
    # The paragraph right after the "Contact" heading holds the address in
    # its first child element and the phone line in its third.
    contact_block = mayorpage.xpath(
      '//strong[contains(text(), "Contact")]/ancestor::p/'
      'following-sibling::p[1]')[0]
    mayor_address = contact_block[0].text_content()
    mayor_phone = contact_block[2].text.split(':')[1]

    mayor = Legislator(name=mayor_name, post_id='Cape Breton', role='Mayor')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_contact('address', mayor_address, 'legislature')
    mayor.add_contact('voice', mayor_phone, 'legislature')
    # email is protected through JS
    mayor.image = mayor_photo
    yield mayor

示例#49

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

def councillor_data(url, name, ward):
  """Yield one Councillor Legislator built from the profile page at url.

  name and ward are passed straight through as the record's name and
  post_id; the phone number and photo are read from the page.
  """
  profile = lxmlize(url)
  # Email is only offered as a form on a separate page, so it is not scraped.
  phone_text = profile.xpath('string(//strong[contains(., "Phone")])')
  phone = phone_text.split(':')[1]
  image_src = profile.xpath('string(//div[@id="contentcontainer"]//img/@src)')
  image_url = urljoin(url, image_src)

  councillor = Legislator(name=name, post_id=ward, role='Councillor')
  for source in (COUNCIL_PAGE, url):
    councillor.add_source(source)
  councillor.add_contact('voice', phone, 'legislature')
  councillor.image = image_url
  yield councillor

示例#50

0

显示文件

def mayor_data(node):
  """Return the Legislator record for the Hamilton mayor described by node."""
  # The first six characters of the <strong> text are dropped to leave the
  # name (presumably a "Mayor " prefix - confirm against the page).
  strong_text = node.xpath('string(.//strong)')
  voice = node.xpath('string(.//p[2]/text()[1])')
  address = node.xpath('string((.//a)[1])')
  image_src = node.xpath('string(.//img/@src)')

  mayor = Legislator(name=strong_text[6:], post_id='Hamilton', role='Mayor')
  mayor.add_source(COUNCIL_PAGE)
  mayor.add_contact('email', address, None)
  mayor.add_contact('voice', voice, 'legislature')
  mayor.image = image_src

  return mayor

示例#51

0

显示文件

def councillor_data(html):
  """Return a Councillor Legislator built from one councillor element.

  The "wardInfo" div must contain exactly two text nodes: the district
  followed by the phone number.
  """
  full_name = html.xpath('string(./div[@class="councillorInfo"]/a/text()[2])')
  address = html.xpath('string(./div[@class="emailInfo"])')
  ward_lines = html.xpath('./div[@class="wardInfo"]/text()')
  district, phone = ward_lines
  # First src attribute found anywhere below this element.
  image_src = html.xpath('string((.//@src)[1])')

  councillor = Legislator(name=full_name, post_id=district, role='Councillor')
  councillor.add_source(COUNCIL_PAGE)
  councillor.add_contact('voice', phone, 'legislature')
  councillor.add_contact('email', address, None)
  councillor.image = image_src

  return councillor

示例#52

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

def councillor_data(url, name, ward):
    """Yield one Councillor Legislator built from the profile page at url.

    name and ward are passed straight through as the record's name and
    post_id; the phone number and photo are read from the page.
    """
    profile = lxmlize(url)
    # Email is only offered as a form on a separate page, so it is skipped.
    phone_text = profile.xpath('string(//strong[contains(., "Phone")])')
    phone = phone_text.split(':')[1]
    image_src = profile.xpath(
        'string(//div[@id="contentcontainer"]//img/@src)')
    image_url = urljoin(url, image_src)

    councillor = Legislator(name=name, post_id=ward, role='Councillor')
    for source in (COUNCIL_PAGE, url):
        councillor.add_source(source)
    councillor.add_contact('voice', phone, 'legislature')
    councillor.image = image_url
    yield councillor