Python CanadianLegislator.add_link示例，utils.CanadianLegislator.add_link Python示例

示例#1

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def scrape_mayor(self, div):
        """Build and return the Legislator record for the mayor of Guelph.

        ``div`` is the mayor's element from the council listing. Name, phone
        and email are read from it; the page its first link points to is
        fetched for the Facebook link, the Twitter link and the photo.
        """
        anchor = div.xpath('.//a')[0]
        name = anchor.text_content().replace('Mayor', '')
        url = anchor.attrib['href']

        p = Legislator(name=name, post_id='Guelph', role='Mayor')
        for source in (COUNCIL_PAGE, url):
            p.add_source(source)

        # The phone number is taken from the third non-blank text node.
        text_nodes = div.xpath('.//text()[normalize-space()]')
        phone = text_nodes[2]
        mailto = div.xpath('.//a[contains(@href,"mailto:")]')[0]
        email = mailto.text_content()

        page = lxmlize(url)

        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)

        # Both social links are required: a missing one raises IndexError.
        social_queries = (
            '//div[@class="entry-content"]//a[contains(@href, "facebook")]',
            '//div[@class="entry-content"]//a[contains(@href, "twitter")]',
        )
        for query in social_queries:
            p.add_link(page.xpath(query)[0].attrib['href'], None)

        p.image = page.xpath('//header/img/@src')[0]

        return p

示例#2

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield one Legislator per member linked from the council page.

    The first link is treated as the mayor of Ajax. For every other member
    the ward and role are parsed out of the heading on the member's page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    links = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')

    for councillor in links:
      name = councillor.text_content()
      url = councillor.attrib['href']
      profile = lxmlize(url)

      if councillor == links[0]:
        district, role = 'Ajax', 'Mayor'
      else:
        heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
        district = re.findall(r'Ward.*', heading)[0].strip()
        # First tuple element is the whole "[Regional ]Councillor" match.
        role = re.findall('((Regional)? ?(Councillor))', heading)[0][0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = profile.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]

      # Skip the first row of the contact table.
      rows = profile.xpath('//table[@class="datatable"][1]//tr')[1:]
      for row in rows:
        cells = row.xpath('./td')
        label = cells[0].text_content().strip()
        value = cells[1].text_content().strip()
        if not re.match(r'(Phone)|(Fax)|(Email)', label):
          # Any other row is recorded as a link.
          p.add_link(value, None)
          continue
        kind = CONTACT_DETAIL_TYPE_MAP[label]
        note = None if kind == 'email' else 'legislature'
        p.add_contact(kind, value, note)
      yield p

示例#3

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a candidate ``Legislator`` for each CSV row that carries a name.

    Every non-empty column is dispatched on its stripped header:
    ``Office N: <field>`` columns are collected per office, ``Party Name`` is
    translated through ``PARTY_MAP``, headers in ``KEYS`` become constructor
    keyword arguments, ``Email`` and ``LINKS_KEYS`` columns become contact and
    link entries, ``EXTRA_KEYS`` columns become extras and ``IGNORE_KEYS``
    columns are dropped. Rows that end up without a ``name`` keyword are
    skipped.

    Raises:
      Exception: carrying the offending header when a column (or an office
        field) is not recognised.
    """
    reader = csv_reader(COUNCIL_PAGE, header=True)
    for row in reader:
      kwargs = {'role': 'candidate'}
      email = None
      links = []
      extra = {}
      offices = []

      for k, v in row.items():
        v = v.strip()
        if not v:
          continue

        k = k.strip()
        match = re.search(r'\AOffice (\d): ', k)
        if match:
          index = int(match.group(1))
          # Grow the list so that office N lives at offices[N - 1].
          while index > len(offices):
            offices.append({})
          # Whatever follows the "Office N: " prefix names the field.
          field = k[match.end():]
          if field == 'Type':
            offices[index - 1]['note'] = v
          elif field in CONTACT_TYPE_KEYS:
            offices[index - 1][CONTACT_TYPE_KEYS[field]] = v
          else:
            raise Exception(k)
        elif k == 'Party Name':
          kwargs['party'] = PARTY_MAP[v]
        elif k in KEYS:
          kwargs[KEYS[k]] = v
        elif k == 'Email':
          email = v
        elif k in LINKS_KEYS:
          links.append({'url': v, 'note': k})
        elif k in IGNORE_KEYS:
          continue
        elif k in EXTRA_KEYS:
          extra[re.sub(r'[^a-z0-9_]', '', k.lower().replace(' ', '_'))] = v
        else:
          raise Exception(k)

      # Flatten the per-office dicts into keyword arguments for add_contact.
      contacts = []
      for office in offices:
        for contact_type in CONTACT_TYPE_KEYS.values():
          if office.get(contact_type):
            contacts.append({
              'note': office['note'],
              'type': contact_type,
              'value': office[contact_type],
            })

      if 'name' in kwargs:
        p = Legislator(**kwargs)
        p.add_source(COUNCIL_PAGE)
        if email:
          p.add_contact('email', email, None)
        for link in links:
          p.add_link(**link)
        for contact in contacts:
          p.add_contact(**contact)
        for k, v in extra.items():
          p.add_extra(k, v)
        yield p

示例#4

0

显示文件

    def get_people(self):
        """Yield the mayor, then one Legislator per remaining council cell.

        Cells come from the table inside ``printArea`` on the council page
        (the first four and the last cell are ignored). Email, phone/fax
        numbers and an optional website are read from each member's own page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        cells = listing.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
        yield self.scrape_mayor(cells[0])

        for councillor in cells[1:]:
            name = ' '.join(
                councillor.xpath('string(.//strong/a[last()])').split())
            heading_parts = councillor.xpath('string(.//strong)').split('-')
            # A heading without a '-' separated part is a regional councillor.
            if len(heading_parts) > 1:
                district, role = heading_parts[1], 'Councillor'
            else:
                district, role = 'Newmarket', 'Regional Councillor'
            url = councillor.xpath('.//a/@href')[0]

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = councillor.xpath('.//img/@src')[0]

            profile = lxmlize(url)
            print_area = profile.xpath('//div[@id="printArea"]')[0]
            paragraphs = print_area.xpath(
                './/p[@class="heading"][2]/following-sibling::p')
            # The address is consumed (skipping one blank paragraph) so the
            # numbers paragraph lines up below; it is not stored on the record.
            address = paragraphs.pop(0).text_content().strip()
            if not address:
                address = paragraphs.pop(0).text_content().strip()

            if 'Ward' in paragraphs[0].text_content():
                paragraphs.pop(0)

            numbers = paragraphs.pop(0).text_content().split(':')
            email = profile.xpath(
                '//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact('email', email, None)

            # Each chunk after a ':' holds a value; the label is the trailing
            # word of the chunk before it.
            for previous, contact in zip(numbers, numbers[1:]):
                if '@' in contact:
                    continue  # executive assistant email
                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})',
                                    contact)[0]
                ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                if ext:
                    number += ext[0].replace('Ext. ', ' x')
                contact_type = re.findall(r'[A-Za-z]+$', previous)[0]

                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)

            site = profile.xpath('.//a[contains(text(), "http://")]')
            if site:
                p.add_link(site[0].text_content(), None)
            yield p

示例#5

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then one Legislator per remaining council cell.

    Cells come from the table inside ``printArea`` on the council page (the
    first four and the last cell are ignored). Email, phone/fax numbers and an
    optional website are read from each member's own page.
    """
    listing = lxmlize(COUNCIL_PAGE)

    cells = listing.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(cells[0])

    for councillor in cells[1:]:
      name = ' '.join(councillor.xpath('string(.//strong/a[last()])').split())
      heading_parts = councillor.xpath('string(.//strong)').split('-')
      # A heading without a '-' separated part is a regional councillor.
      if len(heading_parts) > 1:
        district, role = heading_parts[1], 'Councillor'
      else:
        district, role = 'Newmarket', 'Regional Councillor'
      url = councillor.xpath('.//a/@href')[0]

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = councillor.xpath('.//img/@src')[0]

      profile = lxmlize(url)
      print_area = profile.xpath('//div[@id="printArea"]')[0]
      paragraphs = print_area.xpath('.//p[@class="heading"][2]/following-sibling::p')
      # The address is consumed (skipping one blank paragraph) so the numbers
      # paragraph lines up below; it is not stored on the record.
      address = paragraphs.pop(0).text_content().strip()
      if not address:
        address = paragraphs.pop(0).text_content().strip()

      if 'Ward' in paragraphs[0].text_content():
        paragraphs.pop(0)

      numbers = paragraphs.pop(0).text_content().split(':')
      email = profile.xpath('//a[contains(@href, "mailto:")]/text()')[0]
      p.add_contact('email', email, None)

      # Each chunk after a ':' holds a value; the label is the trailing word
      # of the chunk before it.
      for previous, contact in zip(numbers, numbers[1:]):
        if '@' in contact:
          continue  # executive assistant email
        number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
        ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
        if ext:
          number += ext[0].replace('Ext. ', ' x')
        contact_type = re.findall(r'[A-Za-z]+$', previous)[0]

        if 'Fax' in contact_type:
          p.add_contact('fax', number, 'legislature')
        elif 'Phone' in contact_type:
          p.add_contact('voice', number, 'legislature')
        else:
          p.add_contact(contact_type, number, contact_type)

      site = profile.xpath('.//a[contains(text(), "http://")]')
      if site:
        p.add_link(site[0].text_content(), None)
      yield p

示例#6

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then a Legislator for each councillor cell.

    The mayor and the Ward 4 councillor are delegated to ``scrape_mayor`` and
    ``scrape_4``. Everyone else is built here from the second
    ``microSiteLinksWrapper`` block on their own page.
    """
    listing = lxmlize(COUNCIL_PAGE)

    mayor_url = listing.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
    yield scrape_mayor(mayor_url)

    for councillor in listing.xpath('//div[@class="interiorContentWrapper"]//td[./a]'):
      name = councillor.xpath('.//strong')[1].text_content().strip()
      title = councillor.xpath('.//a//text()[normalize-space()]')[0]
      if 'Ward' in title:
        district = title.replace('Councillor', '')
        role = 'Councillor'
      else:
        # No ward in the title: the title itself is used as the role.
        district = 'Markham'
        role = title

      image = councillor.xpath('.//img/@src')[0]
      url = councillor.xpath('.//a/@href')[0]

      if 'Ward 4' in district:
        yield scrape_4(name, url, image)
        continue

      profile = lxmlize(url)

      p = Legislator(name=name, post_id=district, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      p.image = image
      contact = profile.xpath('//div[@class="microSiteLinksWrapper"]')[1]

      # Prefer paragraph text; fall back to div text when there is none.
      infos = contact.xpath('.//p/text()') or contact.xpath('.//div/text()')

      joined_address = ' '.join(infos[:2])
      address = re.sub(r'\s{2,}', ' ', joined_address).strip()
      phone = infos[2].split(':')[1].strip()
      email = contact.xpath('.//a[contains(@href,"mailto:")]/text()')[0]
      website = contact.xpath('.//a[not( contains(@href, "mailto:"))]/text()')
      if website:
        p.add_link(website[0], None)
      for kind, value, note in (
          ('address', address, 'legislature'),
          ('voice', phone, 'legislature'),
          ('email', email, None),
      ):
        p.add_contact(kind, value, note)

      get_links(p, contact)
      yield p

示例#7

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield one MLA Legislator per data row of the ``MLAs`` table.

        Name, party and district come from the listing row. Addresses, phone
        and fax numbers, email and an optional website come from the second
        row of the ``mla-contact`` table on the member's own page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        # Skip the first row of the table.
        for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
            cells = row.xpath('./td')
            # Keep only what follows the first ". " in the name cell.
            name = cells[0].text_content().split('. ', 1)[1]
            party = cells[1].text
            district = cells[2].text_content()
            url = row.xpath('./td[1]/a/@href')[0]
            profile = lxmlize(url)

            p = Legislator(name=name,
                           post_id=district,
                           role='MLA',
                           party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            contact = profile.xpath('//table[@id="mla-contact"]//tr[2]')[0]
            website = contact.xpath('./td[3]//div[3]//a')
            if website:
                p.add_link(website[0].text_content(), None)

            legislature_address = contact.xpath(
                './td[1]/div[2]')[0].text_content()
            p.add_contact('address', legislature_address, 'legislature')
            constituency_address = ''.join(
                contact.xpath('./td[2]/div//text()')[1:7])
            p.add_contact('address', constituency_address, 'constituency')

            # Order: legislature voice, constituency voice, legislature fax,
            # constituency fax.
            raw_numbers = [
                contact.xpath('./td[1]/div[3]')[0].text_content().split(
                    ':')[1].strip(),
                contact.xpath('./td[2]/div[4]//span/text()')[0],
                contact.xpath('./td[1]/div[4]')[0].text_content().split(
                    ':')[1].strip(),
                contact.xpath('./td[2]/div[5]//span/text()')[0],
            ]
            # Numbers shorter than ten characters get the 306 prefix.
            numbers = [
                number if len(number) >= 10 else '306-%s' % number
                for number in raw_numbers
            ]
            slots = (
                ('voice', 'legislature'),
                ('voice', 'constituency'),
                ('fax', 'legislature'),
                ('fax', 'constituency'),
            )
            for (kind, note), number in zip(slots, numbers):
                p.add_contact(kind, number, note)

            email = contact.xpath(
                './td[3]//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact('email', email, None)

            yield p

示例#8

0

显示文件

文件： people.py 项目： rhymeswithcycle/scrapers-ca

    def get_people(self):
        """Yield the mayor, then one Councillor per row of the council table.

        Rows whose text contains "Position" are skipped. Address, email,
        photo, phone numbers and an optional trailing link are read from each
        councillor's own page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        yield scrape_mayor()

        for row in listing.xpath('//div[@id="centre_content"]//tr'):
            if "Position" in row.text_content():
                continue

            cells = row.xpath("./td")
            district = cells[0].text_content().replace("Councillor", "")
            name = cells[1].text_content()
            url = row.xpath("./td/a")[0].attrib["href"]

            p = Legislator(name=name, post_id=district, role="Councillor")
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            profile = lxmlize(url)

            first_paragraph = profile.xpath('//div[@id="centre_content"]//p')[0]
            address = first_paragraph.text_content().replace("\r\n", ", ")
            mailto = profile.xpath('//a[contains(@href,"mailto:")]')[0]
            email = mailto.attrib["href"].replace("mailto:", "")
            p.add_contact("address", address, "legislature")
            p.add_contact("email", email, None)

            p.image = profile.xpath('//div[@id="centre_content"]//img/@src')[0]

            numbers = profile.xpath('//div[@id="centre_content"]//p[contains(text(),"-")]')[0].text_content()
            if "tel" in numbers:
                phone = re.findall(r"(.*)tel", numbers)[0].strip()
                phone = phone.replace(" ", "-")
                # NOTE(review): these two literals match a backslash followed
                # by "xc2" / "xa0", not the bytes themselves; confirm intent.
                phone = phone.replace("\\xc2", "").replace("\\xa0", "-")
                p.add_contact("voice", phone, "legislature")
            for kind, pattern in (("cell", r"(.*)cell"), ("fax", r"(.*)fax")):
                if kind in numbers:
                    value = re.findall(pattern, numbers)[0].strip().replace(" ", "-")
                    p.add_contact(kind, value, "legislature")

            # With more than two anchors, the last one is recorded as a link.
            anchors = profile.xpath('//div[@id="centre_content"]//a')
            if len(anchors) > 2:
                p.add_link(anchors[-1].attrib["href"], None)
            yield p

示例#9

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield a Legislator for every ``two_third last`` block on the page.

        The first block is handed to ``scrape_mayor``. The others are built
        as councillors from the block itself plus the councillor's own page
        (photo, address, blog, Facebook and Twitter links).
        """
        listing = lxmlize(COUNCIL_PAGE)

        councillors = listing.xpath('//*[@class="two_third last"]')
        for councillor in councillors:
            if councillor == councillors[0]:
                yield self.scrape_mayor(councillor)
                continue

            anchor = councillor.xpath('.//a')[0]
            name = anchor.text_content().replace('Councillor', '')
            name = name.replace('Mayor', '')
            # Non-blank text nodes: index 2 is used as the district and
            # index 3 as the phone number.
            info = councillor.xpath('.//text()[normalize-space()]')
            district = info[2]
            url = anchor.attrib['href']

            p = Legislator(name=name, post_id=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            voice = info[3].replace('extension', 'x')
            p.add_contact('voice', voice, 'legislature')
            mailto = councillor.xpath('.//a[contains(@href,"mailto:")]')
            if mailto:
                p.add_contact('email', mailto[0].text_content(), None)

            site = councillor.xpath('.//a[contains(text(),"Website")]')
            if site:
                p.add_link(site[0].attrib['href'], None)

            profile = lxmlize(url)

            p.image = profile.xpath('//header/img/@src')[0]

            entry_text = profile.xpath(
                '//div[@class="entry-content"]')[0].text_content()
            address = re.findall(r'Address: (.*)Phone', entry_text)
            if address:
                p.add_contact('address', address[0], 'legislature')

            # Optional links, looked up in this order: blog, Facebook, Twitter.
            optional_links = (
                '//a[contains(text(),"Blog")]',
                '//div[@class="entry-content"]//a[contains(@href, "facebook")]',
                '//div[@class="entry-content"]//a[contains(@href, "twitter")]',
            )
            for query in optional_links:
                found = profile.xpath(query)
                if found:
                    p.add_link(found[0].attrib['href'], None)
            yield p

示例#10

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def scrape_mayor(self, div):
    """Build and return the Legislator record for the mayor of Guelph.

    ``div`` is the mayor's element from the council listing. Name, phone and
    email are read from it; the page its first link points to is fetched for
    the Facebook link, the Twitter link and the photo.
    """
    anchor = div.xpath('.//a')[0]
    name = anchor.text_content().replace('Mayor', '')
    url = anchor.attrib['href']

    p = Legislator(name=name, post_id='Guelph', role='Mayor')
    for source in (COUNCIL_PAGE, url):
      p.add_source(source)

    # The phone number is taken from the third non-blank text node.
    text_nodes = div.xpath('.//text()[normalize-space()]')
    phone = text_nodes[2]
    mailto = div.xpath('.//a[contains(@href,"mailto:")]')[0]
    email = mailto.text_content()

    page = lxmlize(url)

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('email', email, None)

    # Both social links are required: a missing one raises IndexError.
    social_queries = (
      '//div[@class="entry-content"]//a[contains(@href, "facebook")]',
      '//div[@class="entry-content"]//a[contains(@href, "twitter")]',
    )
    for query in social_queries:
      p.add_link(page.xpath(query)[0].attrib['href'], None)

    p.image = page.xpath('//header/img/@src')[0]

    return p

示例#11

0

显示文件

  def get_people(self):
    """Yield a Legislator for each member cell of the ``Table3table`` table.

    A cell whose first bold text contains "Councillor" is a councillor; its
    role and ward are split out of the text that follows. Any other cell is
    treated as the mayor of Pickering. Shared contact details come from the
    ``nicEdit-visualClass`` table: column one for the mayor, column two for
    councillors.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor_contacts = page.xpath('//table[@class="nicEdit-visualClass"]//tr/td[1]/text()')
    council_contacts = page.xpath('//table[@class="nicEdit-visualClass"]//tr/td[2]/text()')

    councillors = page.xpath('//table[@id="Table3table"]//strong/ancestor::td')
    for councillor in councillors:
      name = councillor.xpath('.//strong/text()')[0]
      if 'Councillor' in name:
        name = name.replace('Councillor', '').strip()
        role_ward = councillor.xpath('./text()')[0]
        if not role_ward.strip():
          # The cell's own text is blank; use the first paragraph text.
          role_ward = councillor.xpath('.//p/text()')[0]
        role_ward = role_ward.split(' ')
        # First two words form the role (minus a leading "City "); the rest
        # is the ward. Raw string: '\A' is not a valid str escape sequence.
        role = re.sub(r'\ACity ', '', ' '.join(role_ward[:2]))
        ward = ' '.join(role_ward[2:])
      else:
        # For the mayor the name is the second bold text node.
        name = councillor.xpath('.//strong/text()')[1]
        role = 'Mayor'
        ward = 'Pickering'
      email = councillor.xpath('.//a[contains(@href, "mailto:")]/text()')[0]
      p = Legislator(name=name, post_id=ward, role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_contact('email', email, None)
      p.image = councillor.xpath('.//img/@src')[0]

      links = councillor.xpath('.//a')
      for link in links:
        if '@' in link.text_content():
          continue  # email anchor, already recorded above
        if 'Profile' in link.text_content():
          p.add_source(link.attrib['href'])
        else:
          p.add_link(link.attrib['href'], None)

      if role == 'Mayor':
        add_contacts(p, mayor_contacts)
      else:
        add_contacts(p, council_contacts)
      yield p

示例#12

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each member linked from the council page.

    Role and district are derived from the ``sectionheading`` text on the
    member's page. Address, phone and fax are cut out of the text of the last
    cell of the info table with regular expressions.
    """
    listing = lxmlize(COUNCIL_PAGE)

    for councillor in listing.xpath('//center/center//a'):
      name = councillor.text_content().strip()
      url = councillor.attrib['href']
      profile = lxmlize(url)
      header = profile.xpath('//div[@class="sectionheading"]')[0].text_content()
      if header == 'Mayor of Richmond Hill':
        district, role = 'Richmond Hill', 'Mayor'
      else:
        # The district sits between the comma and the dash, when present.
        found = re.findall(r',(.*)-', header)
        district = found[0].strip() if found else 'Richmond Hill'
        if 'Regional' in header:
          role = 'Regional Councillor'
        else:
          role = 'Councillor'

      cells = profile.xpath('//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]')
      info = cells[0].text_content().replace(' - office:', ':')

      address = re.findall(r'(?<=Town of Richmond Hill).*(?=Telephone)', info)[0]
      # Put a space wherever a lowercase letter runs into an uppercase one.
      address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)

      phone = re.findall(r'(?<=Telephone:) (.*)(?=Fax)', info)[0]
      phone = phone.replace('(', '').replace(') ', '-')
      phone = phone.replace(', ext. ', ' x')

      fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0]
      fax = fax.replace(' ', '').replace('(', '').replace(')', '-')

      mailto = profile.xpath('.//a[contains(@href, "mailto:")]/@href')[0]
      email = mailto.replace('mailto:', '')

      p = Legislator(name=name, post_id=district, role=role)
      for source in (COUNCIL_PAGE, url):
        p.add_source(source)
      for kind, value, note in (
          ('address', address, 'legislature'),
          ('voice', phone, 'legislature'),
          ('fax', fax, 'legislature'),
          ('email', email, None),
      ):
        p.add_contact(kind, value, note)
      p.image = profile.xpath('//img[contains(@alt, "%s")]/@src' % name)[0]
      if 'Website' in info:
        website = re.findall(r'www\..*\.[a-z]+', info)[0]
        p.add_link(website, None)
      yield p

示例#13

0

显示文件

  def get_people(self):
    """Yield the mayor, then one Councillor per row of the council table.

    Rows whose text contains 'Position' are skipped. Address, email, photo,
    phone numbers and an optional trailing link are read from each
    councillor's own page.
    """
    listing = lxmlize(COUNCIL_PAGE)

    yield scrape_mayor()

    for row in listing.xpath('//div[@id="centre_content"]//tr'):
      if 'Position' in row.text_content():
        continue

      cells = row.xpath('./td')
      district = cells[0].text_content().replace('Councillor', '')
      name = cells[1].text_content()
      url = row.xpath('./td/a')[0].attrib['href']

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      profile = lxmlize(url)

      first_paragraph = profile.xpath('//div[@id="centre_content"]//p')[0]
      address = first_paragraph.text_content().replace("\r\n", ', ')
      mailto = profile.xpath('//a[contains(@href,"mailto:")]')[0]
      email = mailto.attrib['href'].replace('mailto:', '')
      p.add_contact('address', address, 'legislature')
      p.add_contact('email', email, None)

      p.image = profile.xpath('//div[@id="centre_content"]//img/@src')[0]

      numbers = profile.xpath('//div[@id="centre_content"]//p[contains(text(),"-")]')[0].text_content()
      if 'tel' in numbers:
        phone = re.findall(r'(.*)tel', numbers)[0].strip()
        phone = phone.replace(' ', '-')
        # NOTE(review): these two literals match a backslash followed by
        # "xc2" / "xa0", not the bytes themselves; confirm intent.
        phone = phone.replace("\\xc2", '').replace("\\xa0", '-')
        p.add_contact('voice', phone, 'legislature')
      for kind, pattern in (('cell', r'(.*)cell'), ('fax', r'(.*)fax')):
        if kind in numbers:
          value = re.findall(pattern, numbers)[0].strip().replace(' ', '-')
          p.add_contact(kind, value, 'legislature')

      # With more than two anchors, the last one is recorded as a link.
      anchors = profile.xpath('//div[@id="centre_content"]//a')
      if len(anchors) > 2:
        p.add_link(anchors[-1].attrib['href'], None)
      yield p

示例#14

0

显示文件

    def get_people(self):
        """Yield one Legislator per member linked from the council page.

        The first link is treated as the mayor of Ajax. For every other
        member the ward and role are parsed out of the heading on the
        member's page.
        """
        listing = lxmlize(COUNCIL_PAGE)
        links = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')

        for councillor in links:
            name = councillor.text_content()
            url = councillor.attrib['href']
            profile = lxmlize(url)

            if councillor == links[0]:
                district, role = 'Ajax', 'Mayor'
            else:
                heading = profile.xpath(
                    '//div[@id="printAreaContent"]//h1')[0].text_content()
                district = re.findall(r'Ward.*', heading)[0].strip()
                # First tuple element is the whole "[Regional ]Councillor"
                # match.
                role = re.findall(
                    '((Regional)? ?(Councillor))', heading)[0][0]

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            p.image = profile.xpath(
                '//div[@class="intQuicklinksPhoto"]/img/@src')[0]

            # Skip the first row of the contact table.
            rows = profile.xpath('//table[@class="datatable"][1]//tr')[1:]
            for row in rows:
                cells = row.xpath('./td')
                label = cells[0].text_content().strip()
                value = cells[1].text_content().strip()
                if not re.match(r'(Phone)|(Fax)|(Email)', label):
                    # Any other row is recorded as a link.
                    p.add_link(value, None)
                    continue
                kind = CONTACT_DETAIL_TYPE_MAP[label]
                note = None if kind == 'email' else 'legislature'
                p.add_contact(kind, value, note)
            yield p

示例#15

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield the mayor, then one Councillor per "Ward" header cell.

    Each header cell gives the district (its text) and, through its second
    child, the councillor's name and page URL. Photo, address and the
    ``contactListing`` rows are read from that page.
    """
    yield scrape_mayor()

    listing = lxmlize(COUNCIL_PAGE)
    for cell in listing.xpath('//th[contains(text(), "Ward")]'):
      district = cell.text
      link = cell[1]
      name = link.text
      page_url = link.attrib['href']
      profile = lxmlize(page_url)

      p = Legislator(name=name, post_id=district, role='Councillor')
      for source in (COUNCIL_PAGE, page_url):
        p.add_source(source)

      image = profile.xpath('//div[@id="contentArea"]//img/@src')
      if image:
        p.image = image[0]

      address = profile.xpath('//address//p')
      if address:
        p.add_contact('address', address[0].text_content(), 'legislature')

      for row in profile.xpath('//table[@class="contactListing"]//tr'):
        contact_type = row.xpath('./th/text()')[0]
        value = row.xpath('./td//text()')[0]
        if 'Title' in contact_type:
          continue
        if any(word in contact_type for word in ('Website', 'Facebook', 'Twitter')):
          # For links the anchor text is used rather than the first text node.
          p.add_link(row.xpath('./td/a/text()')[0], None)
        elif 'Telephone' in contact_type:
          p.add_contact('voice', value, 'legislature')
        elif 'Fax' in contact_type:
          p.add_contact('fax', value, 'legislature')
        elif 'Email' in contact_type:
          p.add_contact('email', value, None)
      yield p

示例#16

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield the mayor, then one Councillor per "Ward" header cell.

        Each header cell gives the district (its text) and, through its
        second child, the councillor's name and page URL. Photo, address and
        the ``contactListing`` rows are read from that page.
        """
        yield scrape_mayor()

        listing = lxmlize(COUNCIL_PAGE)
        for cell in listing.xpath('//th[contains(text(), "Ward")]'):
            district = cell.text
            link = cell[1]
            name = link.text
            page_url = link.attrib['href']
            profile = lxmlize(page_url)

            p = Legislator(name=name, post_id=district, role='Councillor')
            for source in (COUNCIL_PAGE, page_url):
                p.add_source(source)

            image = profile.xpath('//div[@id="contentArea"]//img/@src')
            if image:
                p.image = image[0]

            address = profile.xpath('//address//p')
            if address:
                p.add_contact(
                    'address', address[0].text_content(), 'legislature')

            for row in profile.xpath('//table[@class="contactListing"]//tr'):
                contact_type = row.xpath('./th/text()')[0]
                value = row.xpath('./td//text()')[0]
                if 'Title' in contact_type:
                    continue
                if any(word in contact_type
                       for word in ('Website', 'Facebook', 'Twitter')):
                    # For links the anchor text is used rather than the
                    # first text node.
                    p.add_link(row.xpath('./td/a/text()')[0], None)
                elif 'Telephone' in contact_type:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in contact_type:
                    p.add_contact('email', value, None)
            yield p

示例#17

0

显示文件

文件： people.py 项目： rhymeswithcycle/scrapers-ca

    def get_people(self):
        """Yield one MLA Legislator per data row of the ``MLAs`` table.

        Name, party and district come from the listing row. Addresses, phone
        and fax numbers, email and an optional website come from the second
        row of the ``mla-contact`` table on the member's own page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        # Skip the first row of the table.
        for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
            cells = row.xpath("./td")
            # Keep only what follows the first ". " in the name cell.
            name = cells[0].text_content().split(". ", 1)[1]
            party = cells[1].text
            district = cells[2].text_content()
            url = row.xpath("./td[1]/a/@href")[0]
            profile = lxmlize(url)

            p = Legislator(name=name, post_id=district, role="MLA", party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            contact = profile.xpath('//table[@id="mla-contact"]//tr[2]')[0]
            website = contact.xpath("./td[3]//div[3]//a")
            if website:
                p.add_link(website[0].text_content(), None)

            legislature_address = contact.xpath("./td[1]/div[2]")[0].text_content()
            p.add_contact("address", legislature_address, "legislature")
            constituency_address = "".join(contact.xpath("./td[2]/div//text()")[1:7])
            p.add_contact("address", constituency_address, "constituency")

            # Order: legislature voice, constituency voice, legislature fax,
            # constituency fax.
            raw_numbers = [
                contact.xpath("./td[1]/div[3]")[0].text_content().split(":")[1].strip(),
                contact.xpath("./td[2]/div[4]//span/text()")[0],
                contact.xpath("./td[1]/div[4]")[0].text_content().split(":")[1].strip(),
                contact.xpath("./td[2]/div[5]//span/text()")[0],
            ]
            # Numbers shorter than ten characters get the 306 prefix.
            numbers = [
                number if len(number) >= 10 else "306-%s" % number
                for number in raw_numbers
            ]
            slots = (
                ("voice", "legislature"),
                ("voice", "constituency"),
                ("fax", "legislature"),
                ("fax", "constituency"),
            )
            for (kind, note), number in zip(slots, numbers):
                p.add_contact(kind, number, note)

            email = contact.xpath('./td[3]//a[contains(@href, "mailto:")]/text()')[0]
            p.add_contact("email", email, None)

            yield p

示例#18

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for every ``two_third last`` block on the page.

    The first block is handed to ``scrape_mayor``. The others are built as
    councillors from the block itself plus the councillor's own page (photo,
    address, blog, Facebook and Twitter links).
    """
    listing = lxmlize(COUNCIL_PAGE)

    councillors = listing.xpath('//*[@class="two_third last"]')
    for councillor in councillors:
      if councillor == councillors[0]:
        yield self.scrape_mayor(councillor)
        continue

      anchor = councillor.xpath('.//a')[0]
      name = anchor.text_content().replace('Councillor', '')
      name = name.replace('Mayor', '')
      # Non-blank text nodes: index 2 is used as the district and index 3 as
      # the phone number.
      info = councillor.xpath('.//text()[normalize-space()]')
      district = info[2]
      url = anchor.attrib['href']

      p = Legislator(name=name, post_id=district, role='Councillor')
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)

      voice = info[3].replace('extension', 'x')
      p.add_contact('voice', voice, 'legislature')
      mailto = councillor.xpath('.//a[contains(@href,"mailto:")]')
      if mailto:
        p.add_contact('email', mailto[0].text_content(), None)

      site = councillor.xpath('.//a[contains(text(),"Website")]')
      if site:
        p.add_link(site[0].attrib['href'], None)

      profile = lxmlize(url)

      p.image = profile.xpath('//header/img/@src')[0]

      entry_text = profile.xpath('//div[@class="entry-content"]')[0].text_content()
      address = re.findall(r'Address: (.*)Phone', entry_text)
      if address:
        p.add_contact('address', address[0], 'legislature')

      # Optional links, looked up in this order: blog, Facebook, Twitter.
      optional_links = (
        '//a[contains(text(),"Blog")]',
        '//div[@class="entry-content"]//a[contains(@href, "facebook")]',
        '//div[@class="entry-content"]//a[contains(@href, "twitter")]',
      )
      for query in optional_links:
        found = profile.xpath(query)
        if found:
          p.add_link(found[0].attrib['href'], None)
      yield p

示例#19

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

  def get_people(self):
    """Yield a Legislator for each council member linked from COUNCIL_PAGE.

    Each linked profile page is fetched. Its ``PL_Title`` text decides the
    role: a title containing "Councillor" yields a Councillor (or a Regional
    Councillor for the district "Vaughan" when the text before "Councillor"
    contains "Regional"); any other title is treated as the Mayor of Vaughan.

    Raises IndexError when an expected element (title, contact block, phone,
    fax or e-mail link) is missing from a profile page.
    """
    listing = lxmlize(COUNCIL_PAGE)

    councillors = listing.xpath('//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    for councillor in councillors:
      url = councillor.attrib['href']
      profile = lxmlize(url)

      title = profile.xpath('//div[@class="PL_Title"]')[0].text_content()
      if "Councillor" in title:
        district, name = re.split(r'Councillor', title)
        role = 'Councillor'
        if "Regional" in district:
          district = "Vaughan"
          role = 'Regional Councillor'
      else:
        name = re.split(r'Mayor', title)[-1]
        district = 'Vaughan'
        role = 'Mayor'
      name = name.strip()

      # The first linked profile keeps its contact details in a different
      # web part than all the others.
      if councillor == councillors[0]:
        contact_info = profile.xpath('//div[@id="WebPartWPQ2"]')[0]
      else:
        contact_info = profile.xpath('//div[@id="WebPartWPQ3"]')[0]
      contact_text = contact_info.text_content()

      phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}', contact_text)[0].replace('ext. ', 'x')
      # The second bare number in the block is taken as the fax; presumably
      # the first one is the voice line matched above.
      fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_text)[1]
      email = contact_info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

      p = Legislator(name=name, post_id=district.strip(), role=role)
      p.add_source(COUNCIL_PAGE)
      p.add_source(url)
      p.add_contact('voice', phone, 'legislature')
      p.add_contact('fax', fax, 'legislature')
      p.add_contact('email', email, None)

      image = profile.xpath('//img[contains(@alt, "Councillor")]/@src')
      if image:
        p.image = image[0]

      # Social links are optional; each query runs once and only the first
      # match is used.
      # NOTE(review): the search covers the whole profile page. If site-wide
      # header/footer links get picked up, scope it to the links web part.
      for service in ('facebook', 'twitter', 'youtube'):
        anchors = profile.xpath('.//a[contains(@href,"%s")]' % service)
        if anchors:
          p.add_link(anchors[0].attrib['href'], None)
      yield p

示例#20

0

显示文件

文件： people.py 项目： rhymeswithcycle/scrapers-ca

    def get_people(self):
        """Yield a Legislator (role "MP", lower chamber) per row of the table on COUNCIL_PAGE.

        For each row the member's own page is fetched to collect the e-mail
        address, photo, Hill office phone/fax and every constituency office.
        A Twitter link is added when the member's name appears in the mapping
        downloaded from the twitter_users endpoint.
        """
        twitter_response = requests.get("http://scrapers-ruby.herokuapp.com/twitter_users")
        screen_names = json.loads(twitter_response.content)

        listing = lxmlize(COUNCIL_PAGE)
        # The first table row is skipped.
        member_rows = listing.xpath('//div[@class="main-content"]//tr')[1:]
        for row in member_rows:
            name_cell = row.xpath("./td[1]")[0]
            last_name = name_cell.xpath("string(.//span[1])")
            first_name = name_cell.xpath("string(.//span[2])")
            name = "%s %s" % (first_name, last_name)
            constituency = row.xpath("string(./td[2])")
            province = row.xpath("string(./td[3])")  # read but not used below
            party = row.xpath("string(./td[4])")

            url = name_cell.xpath("string(.//a/@href)")
            mp_page = lxmlize(url)
            email = mp_page.xpath('string(//span[@class="caucus"]/a[contains(., "@")])')
            photo = mp_page.xpath('string(//div[@class="profile overview header"]//img/@src)')

            m = Legislator(name=name, post_id=constituency, role="MP", chamber="lower", party=party)
            for source in (COUNCIL_PAGE, url):
                m.add_source(source)

            screen_name = screen_names.get(name)
            if screen_name:
                m.add_link("https://twitter.com/%s" % screen_name)

            # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
            # One member gets a hard-coded address when the page lists none.
            if not email and name == "Adam Vaughan":
                email = "*****@*****.**"
            if email:
                m.add_contact("email", email, None)
            m.image = photo

            # The legislature address is written in French for Québec members.
            in_quebec = mp_page.xpath('string(//span[@class="province"][1])') == u"Québec"
            if in_quebec:
                hill_address = "Chambre des communes\nOttawa ON  K1A 0A6"
            else:
                hill_address = "House of Commons\nOttawa ON  K1A 0A6"
            m.add_contact("address", hill_address, "legislature")

            # Voice is tested before its label is stripped; fax is tested after.
            hill_voice = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Telephone:")])')
            if hill_voice:
                m.add_contact("voice", hill_voice.replace("Telephone: ", ""), "legislature")
            hill_fax = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Fax:")])')
            hill_fax = hill_fax.replace("Fax: ", "")
            if hill_fax:
                m.add_contact("fax", hill_fax, "legislature")

            for office in mp_page.xpath('//div[@class="constituencyoffices"]//li'):
                spans = office.xpath('./span[not(@class="spacer")]')
                # First four spans: address line 1, address line 2,
                # city/region, postal code. Indexing (not slicing) keeps the
                # IndexError when an office has fewer than four.
                address_lines = [spans[i].text_content() for i in range(4)]
                m.add_contact("address", "\n".join(address_lines), "constituency")

                office_voice = office.xpath('string(./span[contains(text(), "Telephone:")])')
                office_voice = office_voice.replace("Telephone: ", "")
                if office_voice:
                    m.add_contact("voice", office_voice, "constituency")

                office_fax = office.xpath('string(./span[contains(text(), "Fax:")])')
                office_fax = office_fax.replace("Fax: ", "")
                if office_fax:
                    m.add_contact("fax", office_fax, "constituency")

            yield m

示例#21

0

显示文件

文件： people.py 项目： fchagnon/scrapers-ca

    def get_people(self):
        """Yield a Legislator for each council member linked from COUNCIL_PAGE.

        Each linked profile page is fetched. Its ``PL_Title`` text decides
        the role: a title containing "Councillor" yields a Councillor (or a
        Regional Councillor for the district "Vaughan" when the text before
        "Councillor" contains "Regional"); any other title is treated as the
        Mayor of Vaughan.

        Raises IndexError when an expected element (title, contact block,
        phone, fax or e-mail link) is missing from a profile page.
        """
        listing = lxmlize(COUNCIL_PAGE)

        councillors = listing.xpath(
            '//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a'
        )
        for councillor in councillors:
            url = councillor.attrib['href']
            profile = lxmlize(url)

            title = profile.xpath('//div[@class="PL_Title"]')[0].text_content()
            if "Councillor" in title:
                district, name = re.split(r'Councillor', title)
                role = 'Councillor'
                if "Regional" in district:
                    district = "Vaughan"
                    role = 'Regional Councillor'
            else:
                name = re.split(r'Mayor', title)[-1]
                district = 'Vaughan'
                role = 'Mayor'
            name = name.strip()

            # The first linked profile keeps its contact details in a
            # different web part than all the others.
            if councillor == councillors[0]:
                contact_info = profile.xpath('//div[@id="WebPartWPQ2"]')[0]
            else:
                contact_info = profile.xpath('//div[@id="WebPartWPQ3"]')[0]
            contact_text = contact_info.text_content()

            phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}',
                               contact_text)[0].replace('ext. ', 'x')
            # The second bare number in the block is taken as the fax;
            # presumably the first one is the voice line matched above.
            fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_text)[1]
            email = contact_info.xpath(
                './/a[contains(@href, "mailto:")]')[0].text_content()

            p = Legislator(name=name, post_id=district.strip(), role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')
            p.add_contact('email', email, None)

            image = profile.xpath('//img[contains(@alt, "Councillor")]/@src')
            if image:
                p.image = image[0]

            # Social links are optional; each query runs once and only the
            # first match is used.
            # NOTE(review): the search covers the whole profile page. If
            # site-wide header/footer links get picked up, scope it to the
            # links web part.
            for service in ('facebook', 'twitter', 'youtube'):
                anchors = profile.xpath(
                    './/a[contains(@href,"%s")]' % service)
                if anchors:
                    p.add_link(anchors[0].attrib['href'], None)
            yield p

示例#22

0

显示文件

    def get_people(self):
        """Yield a Legislator (role "candidate") for each CSV row that has a name.

        Every non-empty cell is routed by its column header:

        * ``Office N: <field>`` columns fill the N-th office, where
          ``<field>`` is ``Type`` (stored as the office note) or a key of
          CONTACT_TYPE_KEYS.
        * ``Party Name`` is translated through PARTY_MAP.
        * KEYS columns become Legislator constructor arguments.
        * ``Email`` and LINKS_KEYS columns become a contact and links.
        * EXTRA_KEYS columns become extras under a slugified key.
        * IGNORE_KEYS columns are dropped.

        Raises:
            Exception: carrying the column header, when a header is not
                recognised by any of the rules above.
        """
        reader = csv_reader(COUNCIL_PAGE, header=True)
        for row in reader:
            kwargs = {'role': 'candidate'}
            email = None
            links = []
            extra = {}
            offices = []

            for k, v in row.items():
                v = v.strip()
                if not v:
                    continue

                k = k.strip()
                match = re.search(r'\AOffice (\d): ', k)
                if match:
                    index = int(match.group(1))
                    # Pad the list so that office N lives at offices[N - 1],
                    # whatever order the columns are visited in.
                    while index > len(offices):
                        offices.append({})
                    # The part of the header after the "Office N: " prefix.
                    field = k[match.end():]
                    if field == 'Type':
                        offices[index - 1]['note'] = v
                    elif field in CONTACT_TYPE_KEYS:
                        offices[index - 1][CONTACT_TYPE_KEYS[field]] = v
                    else:
                        raise Exception(k)
                elif k == 'Party Name':
                    kwargs['party'] = PARTY_MAP[v]
                elif k in KEYS:
                    kwargs[KEYS[k]] = v
                elif k == 'Email':
                    email = v
                elif k in LINKS_KEYS:
                    links.append({'url': v, 'note': k})
                elif k in IGNORE_KEYS:
                    continue
                elif k in EXTRA_KEYS:
                    extra[re.sub(r'[^a-z0-9_]', '',
                                 k.lower().replace(' ', '_'))] = v
                else:
                    raise Exception(k)

            # Flatten the offices into keyword-argument dicts for add_contact.
            contacts = []
            for office in offices:
                for contact_type in CONTACT_TYPE_KEYS.values():
                    if office.get(contact_type):
                        contacts.append({
                            'note': office['note'],
                            'type': contact_type,
                            'value': office[contact_type],
                        })

            if 'name' in kwargs:
                p = Legislator(**kwargs)
                p.add_source(COUNCIL_PAGE)
                if email:
                    p.add_contact('email', email, None)
                for link in links:
                    p.add_link(**link)
                for contact in contacts:
                    p.add_contact(**contact)
                for k, v in extra.items():
                    p.add_extra(k, v)
                yield p