Exemplo n.º 1
0
    def scrape(self):
        """Yield the mayor, then one Person per non-vacant row of the council table.

        The name and photo of each councillor are read from the detail page
        linked from the second cell of the row.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        yield self.scrape_mayor(page)

        for row in page.xpath('//tbody/tr'):
            # Skip rows whose second cell starts with the text 'Vacant'.
            if row.xpath('./td[2]//text()')[0] == 'Vacant':
                continue

            district = row.xpath('./td[1]/text()')[0]
            if 'Conseiller n' in district:
                district = 'Greenfield Park'

            detail_url = row.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')
            name = detail_page.xpath('//h1/text()')[0]

            # Prefer an image whose alt text contains the name; otherwise fall
            # back to the image carrying the "droite" class.
            photos = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
            if not photos:
                photos = detail_page.xpath('//img[contains(@class, "droite")]/@src')
            photo_url = photos[0]

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            p.image = photo_url
            yield p
Exemplo n.º 2
0
    def scrape(self):
        """Yield a Person for every member box found on the council page.

        The district (or the 'Maire' marker) comes from the member's own page,
        which is linked from the box's call-to-action anchor.
        """
        listing = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        boxes = listing.xpath('//div[contains(@class, "member-box member-box--")]')
        assert len(boxes), 'No councillors found'
        for box in boxes:
            name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
            # Keep the part of the first span's text that follows the first 'T'.
            phone_text = box.xpath('.//div[@class="fiche__social"]/span/text()')[0]
            phone = phone_text.split('T')[1]
            email_mailto = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
            photo_url = box.xpath('.//img')[0].attrib['src']

            detail_url = box.xpath('.//a[@class="member-box__calltoaction"]/@href')[0]
            detail = self.lxmlize(detail_url)
            category = detail.xpath('.//div[@class="fiche__category"]/text()')[0]

            if category == 'Maire':
                district, role = 'Terrebonne', 'Maire'
            else:
                district, role = 'District {}'.format(category), 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            if email_mailto:
                p.add_contact('email', email_mailto[0].split('mailto:')[1])
            yield p
Exemplo n.º 3
0
    def scrape(self):
        """Yield the mayor, then one Person per "District" block of the council page."""
        page = self.lxmlize(COUNCIL_PAGE)

        # The mayor's block: first text line holds the name, second the phone.
        mayor_lines = page.xpath('//div[./div/h3[contains(text(), "Maire")]]/p/text()')
        mayor_name = mayor_lines[0].strip().split('.')[1].strip()
        mayor_phone = mayor_lines[1].strip().split(':')[1].strip()

        mayor = Person(primary_org='legislature', name=mayor_name, district='Saguenay', role='Maire')
        mayor.add_source(COUNCIL_PAGE)
        mayor.add_contact('voice', mayor_phone, 'legislature')
        yield mayor

        for block in page.xpath('//div[./div/h3[contains(text(), "District")]]'):
            district = block.xpath('./div/h3')[0].text_content().replace('#', '')
            lines = block.xpath('.//p/text()')

            # Round-trip through Latin-1 bytes and decode them as UTF-8
            # (presumably repairs mis-decoded accented characters).
            name = lines[0].encode('latin-1').decode('utf-8')
            for title in ('M. ', 'Mme '):
                name = name.replace(title, '')
            name = name.strip()

            phone = lines[1].split(':')[1].strip().replace(' ', '-')
            email = self.get_email(block)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            yield p
Exemplo n.º 4
0
    def scrape(self):
        """Yield a Person for each non-empty table cell on the council page.

        The first paragraph of a cell must contain exactly three text nodes:
        district, role and name, in that order.
        """
        # Site role labels mapped to the role names stored on the Person.
        role_names = {
            'City Councillor': 'Councillor',
            'Regional & City Councillor': 'Regional Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)
        cells = page.xpath('//table//td[*]')

        assert len(cells), 'No councillors found'
        for cell in cells:
            district, role, name = cell.xpath('./p[1]/text()')
            role = role.strip()
            role = role_names.get(role, role)

            if district == 'City of Oshawa':
                district = 'Oshawa'

            photo_url = cell.xpath('./p/img/@src')[0]
            phone_node = cell.xpath('./p[contains(.//text(), "Phone")]')[0]
            phone = self.get_phone(phone_node, area_codes=[905])

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', self.get_email(cell))
            yield p
Exemplo n.º 5
0
    def scrape(self):
        """Yield a Person for each non-vacant entry in the page's view content.

        At-large councillors receive numbered "seat" districts in page order.
        """
        next_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        for entry in page.xpath('//div[@class="view-content"]/div'):
            fields = entry.xpath('./div')
            role = fields[0].xpath('./div//text()')[0]
            # Title-case the link text and keep what follows the role label.
            link_text = fields[2].xpath('.//a//text()')[0]
            name = link_text.title().split(role)[-1].strip()
            if name == 'Vacant':
                continue

            if 'Ward' in role:
                # Ward labels double as the district name.
                district, role = role, 'Councillor'
            elif 'At Large' in role:
                role = 'Councillor at Large'
                district = "St. John's (seat {})".format(next_seat)
                next_seat += 1
            else:
                district = "St. John's"

            phone = fields[3].xpath('./div//text()')[0]
            email = self.get_email(fields[5])
            photo_url = entry.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = photo_url
            yield p
Exemplo n.º 6
0
    def scrape(self):
        """Yield a Person (role 'MHA') for every member path found on the council page.

        Member paths are pulled out of the raw page text with a regex; each
        member's details come from their own page.
        """
        self.user_agent = CUSTOM_USER_AGENT
        response = self.get(COUNCIL_PAGE)
        member_paths = re.findall('/Members/YourMember/[^"]+', response.text)
        assert len(member_paths), 'No members found'
        for path in member_paths:
            detail_url = 'http://www.assembly.nl.ca%s' % path
            detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)

            name = detail.xpath('//h1/text()')[0]
            heading = detail.xpath('//h2/text()')[0]
            # Collapse a spaced separator (nbsp, n-dash or hyphen) into an m-dash.
            district = re.sub(r' [\xa0–-] ', '—', heading)
            party = PARTIES[detail.xpath('//h3/text()')[0]]

            p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
            p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]

            contact = detail.xpath('//div[@class="col-md-12"]')[0]
            p.add_contact('email', self.get_email(contact))

            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)

            # One optional phone number per known heading.
            for label, contact_type in HEADING_TYPE.items():
                matches = detail.xpath('//b[.="%s"]/../..' % label)
                if not matches:
                    continue
                phone = self.get_phone(matches[0], error=False)
                if not phone:
                    continue
                p.add_contact('voice', phone, contact_type)

            yield p
Exemplo n.º 7
0
    def scrape(self):
        """Yield a Person (role 'MLA') for each non-vacant row of the members table."""
        # Districts whose scraped name differs from the name to record.
        # @see https://en.wikipedia.org/wiki/Charlotte-Campobello
        # @see https://en.wikipedia.org/wiki/Oromocto-Lincoln-Fredericton
        district_aliases = {
            'Saint Croix': 'Charlotte-Campobello',
            'Oromocto-Lincoln-Fredericton': 'Oromocto-Lincoln',
        }

        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        rows = page.xpath('//table/tbody/tr')
        assert len(rows), 'No members found'
        for row in rows:
            # Every cell after the first, with runs of whitespace collapsed;
            # exactly three such cells are expected.
            riding, table_name, email = [' '.join(td.text_content().split()) for td in row[1:]]

            if 'Vacant' in table_name:
                continue

            district = riding.replace('\x97', '-')
            # The cell reads "<name[, status]> (<party abbreviation>)".
            name_with_status, party_abbr = re.match(r'(.+) \((.+)\)', table_name).groups()
            name = name_with_status.split(',')[0]
            photo_page_url = row[2][0].attrib['href']
            photo_url = self.get_photo_url(photo_page_url)

            district = district_aliases.get(district, district)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA',
                       party=get_party(party_abbr.strip()), image=photo_url)
            p.add_contact('email', email)
            p.add_source(photo_page_url)
            p.add_source(COUNCIL_PAGE)
            yield p
Exemplo n.º 8
0
    def scrape(self):
        """Yield the mayor, then up to two councillors for each ward heading.

        Every ``<h3>`` on the council page is treated as a ward heading, and the
        first two ``<p>`` siblings that follow it are parsed as councillors.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        mayor_info = page.xpath('//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
        yield self.scrape_mayor(mayor_info)

        for ward in page.xpath('//h3'):
            # Raw string: '\A' and '\d' are regex escapes, not string escapes.
            district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())

            # Slicing keeps the "first two paragraphs" rule while also coping
            # with a heading that is followed by a single paragraph.
            for councillor in ward.xpath('following-sibling::p')[:2]:
                name = councillor.xpath('./strong')[0].text_content()

                p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
                p.add_source(COUNCIL_PAGE)

                # The first direct text node is the address; the rest hold numbers.
                info = councillor.xpath('./text()')
                address = info.pop(0)
                p.add_contact('address', address, 'legislature')

                # get phone numbers
                for line in info:
                    pieces = re.split(r'(\xbb)|(\xa0)', line)
                    # Drop the None/empty pieces and the nbsp separators.
                    tokens = [piece for piece in pieces if piece and not re.match(r'\xa0', piece)]
                    self.get_tel_numbers(tokens, p)

                email = self.get_email(councillor)
                p.add_contact('email', email)

                yield p
Exemplo n.º 9
0
    def scrape(self):
        """Yield a Person for each non-email link in the first "entry" div.

        Pages whose main content mentions 'Mayor' produce the mayor; every
        other page is given the next numbered councillor seat.
        """
        next_seat = 1

        index_page = self.lxmlize(COUNCIL_PAGE)
        hrefs = index_page.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
        assert len(hrefs), 'No councillors found'
        for url in hrefs:
            # Skip hrefs containing '@' (e-mail links).
            if '@' in url:
                continue

            detail = self.lxmlize(url)
            main = detail.xpath('//main[@id="content"]')[0]
            name = main.xpath('.//h1//text()')[0]

            is_mayor = 'Mayor' in main.text_content()
            if is_mayor:
                name = name.replace('Mayor ', '')
                role, district = 'Mayor', 'Saanich'
            else:
                role = 'Councillor'
                district = 'Saanich (seat {})'.format(next_seat)
                next_seat += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = detail.xpath('.//@src')[0]
            p.add_contact('voice', self.get_phone(detail, area_codes=[250]), 'legislature')
            p.add_contact('email', self.get_email(main))
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            yield p
Exemplo n.º 10
0
    def scrape(self):
        """Yield a Person for each ``<strong>`` element inside the print area.

        The element's text is the name; the text nodes of its parent hold the
        district line followed by "label: value" contact lines.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for strong in page.xpath('//div[@id="printArea"]//strong'):
            # The parent is either a <p> or a <div>.
            raw_lines = strong.xpath('./parent::p/text()') or strong.xpath('./parent::div/text()')
            lines = [line for line in raw_lines if line.strip()]

            # Cut everything after "Ward <digit>" from the first line.
            district = re.sub(r'(?<=Ward \d).+', '', lines.pop(0))
            if 'Mayor' in district:
                district, role = 'Woolwich', 'Mayor'
            else:
                district = district.replace('Councillor', '').strip()
                role = 'Councillor'

            p = Person(primary_org='legislature', name=strong.text_content(), district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = strong.xpath('./img/@src')[0]

            for line in lines:
                note, num = line.split(':')
                num = num.strip()
                for old, new in (('(', ''), (') ', '-'), ('extension ', 'x')):
                    num = num.replace(old, new)
                p.add_contact(note, num, note)
            yield p
Exemplo n.º 11
0
    def scrape(self):
        """Yield a Person for every profile linked from an ``<h4>`` on the council page.

        The page title supplies "<role> <name>". The mayor gets the district
        'Burnaby'; everyone else gets the next numbered seat.
        """
        councillor_seat_number = 1

        page = self.lxmlize(COUNCIL_PAGE)

        for person_url in page.xpath('//h4/a/@href'):
            detail = self.lxmlize(person_url)

            role, name = detail.xpath('//title//text()')[0].split(' ', 1)
            photo_url = detail.xpath('//div[@id="content"]//img[@style]/@src')[0]

            # Reset for every person: without this, a profile lacking a contact
            # box would either crash on the unbound names or silently inherit
            # the previous person's e-mail and phone.
            email = None
            phone = None
            contact_node = detail.xpath('//div[@id="column-right"]//div[contains(., "Contact")]')
            if contact_node:
                email = self.get_email(contact_node[0])
                phone = self.get_phone(contact_node[0], area_codes=[604, 778])

            if role == 'Mayor':
                district = 'Burnaby'
            else:
                district = 'Burnaby (seat {})'.format(councillor_seat_number)
                councillor_seat_number += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_source(person_url)
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
            yield p
Exemplo n.º 12
0
    def scrape(self):
        """Yield the mayor (when one is returned), then the listed councillors.

        Councillors are the table rows containing an image on the page linked
        as "Councillors"; the last such row is ignored, as are vacant seats.
        """
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)

        mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
        mayor = self.scrape_mayor(mayor_url)
        if mayor:
            yield mayor

        councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
        councillors_page = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)

        rows = councillors_page.xpath('//tr[td//img]')[:-1]

        assert len(rows), 'No councillors found'
        for row in rows:
            # Each row must hold exactly two cells: the photo and the details.
            img_cell, info_cell = tuple(row)
            if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
                continue

            # Non-blank text nodes of the details cell; the first is the name.
            texts = [
                text.strip()
                for text in info_cell.xpath('.//text()')
                if re.sub('\xa0', ' ', text).strip()
            ]
            name = texts[0].replace('Councillor ', '')
            district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
            email = self.get_email(info_cell)
            phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
            # Image sources are resolved against the councillors page URL.
            img_url = urljoin(councillors_url, img_cell.xpath('.//img/@src')[0])

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(councillors_url)
            p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
            p.image = img_url
            yield p
Exemplo n.º 13
0
    def scrape(self):
        """Yield a Person for each two-cell first table row, except the last one.

        A row with exactly three text entries is the mayor; any other row is a
        councillor whose first text entry names the district.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        two_cell_rows = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2]
        for row in two_cell_rows[:-1]:
            texts = row.xpath('.//text()[normalize-space()]')
            desc = [text.strip() for text in texts if text.strip()]

            is_mayor = len(desc) == 3
            if is_mayor:
                role, district = 'Maire', 'Saint-Jérôme'
            else:
                role = 'Conseiller'
                district = desc[0].replace('numéro ', '')

            # The last three entries are always name, phone and e-mail.
            name = desc[-3]
            phone = desc[-2]
            email = desc[-1]

            image = row.xpath('.//img/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            yield p
Exemplo n.º 14
0
    def scrape(self):
        """Yield a Person for every non-vacant member in each council section.

        The nearest preceding ``<h2>`` decides whether a section lists the
        mayor ('Mairie') or councillors.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for section in page.xpath('//div[contains(@class, "membres-conseil-municipal")]'):
            members = section.xpath('./div')
            assert len(members), 'No councillors found'
            for member in members:
                # The heading's text nodes are joined in reverse order.
                name_parts = member.xpath('./h3//text()')
                name = ' '.join(reversed(name_parts))
                if 'vacant' in name.lower():
                    continue

                header = section.xpath('./preceding-sibling::h2/text()')[-1]
                if 'Mairie' in header:
                    district, role = 'Québec', 'Maire'
                else:
                    job_title = member.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
                    # Keep only the name after "District de/de la/du/des".
                    match = re.search(r'\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)', job_title, flags=re.U)
                    district = match.group(1)
                    role = 'Conseiller'

                if district == 'Saules':
                    district = 'Les Saules'
                else:
                    district = district.replace('–', '—')  # n-dash, m-dash

                p = Person(primary_org='legislature', name=name, district=district, role=role)
                p.add_source(COUNCIL_PAGE)
                p.image = member.xpath('./figure//@src')[0]
                p.add_contact('voice', self.get_phone(member, area_codes=[418]), 'legislature')
                yield p
Exemplo n.º 15
0
    def scrape(self):
        """Yield a Person (role 'MLA') for each non-empty row of the first table.

        E-mail, phone and photo come from ``scrape_extended_info`` and are only
        attached when it returns something truthy.
        """
        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//table[1]//tr')

        assert len(rows), 'No members found'
        for row in rows:
            if not row.text_content().strip():
                continue

            name = row.xpath('./td[2]//a[1]//text()')[0]

            # The "MLA" link text reads "<label>: <part> - <part>"; rebuild the
            # district as "<part>-<part>" with 'St ' expanded to 'St. '.
            mla_text = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
            parts = mla_text.split(':')[1].replace('St ', 'St. ').split('-')
            district = parts[0].strip() + '-' + parts[1].strip()

            url = row.xpath('./td[2]//a[1]/@href')[0]
            ext_infos = self.scrape_extended_info(url)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            if ext_infos:  # member pages might return errors
                email, phone, photo_url = ext_infos
                p.image = photo_url
                if email:
                    p.add_contact('email', email)
                if phone:
                    p.add_contact('voice', phone, 'legislature')
            yield p
Exemplo n.º 16
0
    def scrape(self):
        """Yield a Person for each row of the "Table1table" table.

        The second cell's paragraph text gives role, name and (when a third
        text node exists) district; without it the district is 'Milton'.
        """
        # Site role labels mapped to the role names stored on the Person.
        role_names = {
            'Mayor and Regional Councillor': 'Mayor',
            'Local & Regional Councillor': 'Regional Councillor',
            'Local Councillor': 'Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            texts = row.xpath('./td[2]/p/text()')
            name = texts[1]
            role = texts[0].strip()
            role = role_names.get(role, role)
            district = 'Milton' if len(texts) < 3 else texts[2]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('./td[1]/p//img/@src')[0]

            # Only the first row gets an address contact.
            if row == rows[0]:
                address_lines = row.xpath('./td[3]/p[1]/text()')
                address = ', '.join(address_lines).replace('Email:', '').strip()
                p.add_contact('address', address, 'legislature')

            # Each line reads "<type>: <number>".
            for entry in row.xpath('./td[3]/p[2]/text()'):
                num_type, value = entry.split(':')
                value = value.replace(', ext ', ' x').strip()
                p.add_contact(num_type, value, num_type)

            yield p
Exemplo n.º 17
0
    def scrape(self):
        """Yield a Person for every table cell on the council page.

        City and regional councillors are numbered independently; any other
        role is assigned the plain 'Oshawa' district.
        """
        city_seat = 1
        regional_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        cells = page.xpath('//table//td')

        assert len(cells), 'No councillors found'
        for cell in cells:
            # Name and role sit in the first <p>, or else in the first <span>.
            lines = cell.xpath('./p[1]/text()') or cell.xpath('./span[1]/text()')
            name, role = lines
            role = role.strip()

            if role == 'City Councillor':
                role = 'Councillor'
                district = 'Oshawa (seat {})'.format(city_seat)
                city_seat += 1
            elif role == 'Regional and City Councillor':
                role = 'Regional Councillor'
                district = 'Oshawa (seat {})'.format(regional_seat)
                regional_seat += 1
            else:
                district = 'Oshawa'

            photo_url = cell.xpath('./p/img/@src')[0]
            phone_node = cell.xpath('./p[contains(.//text(), "Phone")]')[0]
            phone = self.get_phone(phone_node, area_codes=[905])

            p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', self.get_email(cell))
            yield p
Exemplo n.º 18
0
    def scrape(self):
        """Yield the mayor from MAYOR_URL, then one councillor per district.

        District names and relative page URLs are extracted with a regex from
        the raw response text of COUNCIL_PAGE. District pages without an
        ``<h2>`` produce no Person.
        """
        # mayor first, can't find email
        page = self.lxmlize(MAYOR_URL)
        photo_url = page.xpath('//img/@src[contains(., "maire")]')[0]
        name = page.xpath('//td[@class="contenu"]/text()[last()]')[0]
        p = Person(primary_org='legislature', name=name, district="Trois-Rivières", role="Maire",
                   image=photo_url)
        p.add_source(MAYOR_URL)
        yield p

        resp = self.get(COUNCIL_PAGE)
        # page rendering through JS on the client
        page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
        for district, url_rel in page_re.findall(resp.text):
            # These four districts keep their leading article; for the others
            # the leading "de", "de la", "des" or "du" is removed.
            if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
                # Raw string: '\A' is a regex anchor, not a string escape.
                district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)

            url = urljoin(COUNCIL_PAGE, url_rel)
            page = self.lxmlize(url)

            name_content = page.xpath('//h2//text()')
            if name_content:
                name = name_content[0]
                email = self.get_email(page)
                photo_url = page.xpath('//img/@src[contains(., "Conseiller")]')[0]
                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller',
                           image=photo_url)
                p.add_source(url)
                p.add_contact('email', email)
                yield p
Exemplo n.º 19
0
    def scrape(self):
        """Yield a Person for each council-member section on the page.

        A section with a "District " label is a councillor for that district;
        the first section, when it has no such label, is the mayor.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        sections = page.xpath('//section[contains(@id, "js-council-member")]')
        assert len(sections), 'No councillors found'
        for index, section in enumerate(sections):
            name = ' '.join(section.xpath('.//h2/text()'))
            labels = section.xpath(
                './/span[contains(@class, "c-info-list_label")][contains(text(), "District ")]')

            if labels:
                # Keep the part of the label before the spaced n-dash.
                district = labels[0].text_content().split(' – ')[0]
                role = 'Conseiller'
            elif index == 0:
                district, role = 'Pointe-Claire', 'Maire'
            else:
                # NOTE(review): a later section without a district label passes
                # the empty xpath result through as the district.
                district = labels
                role = 'Conseiller'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = section.xpath('.//@src')[0]
            p.add_contact('email', self.get_email(section))
            phone = self.get_phone(section, area_codes=[514])
            p.add_contact('voice', phone, 'legislature')
            p.add_source(COUNCIL_PAGE)
            yield p
Exemplo n.º 20
0
    def scrape(self):
        """Yield a Person for each row of the 800-wide table.

        The first row is the mayor; every other row is a councillor whose
        district number is the first digit found in the row's fourth text node.
        """
        page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding='windows-1252')

        rows = page.xpath('//table[@width="800"]/tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            # Strip the honorific from the first bold text.
            bold_text = row.xpath('.//strong/text()')[0]
            name = bold_text.replace('Monsieur', '').replace('Madame', '').strip()

            is_mayor = row == rows[0]
            if is_mayor:
                role, district = 'Maire', 'Mercier'
            else:
                role = 'Conseiller'
                digit = re.search(r'(\d)', row.xpath('.//text()')[3]).group(1)
                district = 'District {}'.format(digit)

            email = self.get_email(row)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('email', email)
            yield p
Exemplo n.º 21
0
    def scrape(self):
        """Yield a Person (role 'MLA') for each non-vacant row of the first table.

        The header row is skipped. The e-mail and optional image come from the
        page linked from the member's name cell.
        """
        listing = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        rows = listing.xpath('//table')[0].xpath('.//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            # Each row must hold exactly three cells.
            name_cell, constituency_cell, party_cell = row.xpath('.//td')
            full_name = name_cell.text_content().strip()
            if full_name.lower() == 'vacant':
                continue

            # "<last>, <first>" becomes "<first> <Last>", minus any 'Hon.'.
            last, first = full_name.split(',')
            name = '{} {}'.format(first.replace('Hon.', '').strip(), last.title().strip())
            district = ' '.join(constituency_cell.text_content().split())
            party = get_party(party_cell.text)

            url = name_cell.xpath('.//a')[0].get('href')
            detail = self.lxmlize(url)
            email = self.get_email(detail)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)

            images = detail.xpath('//img[@class="page_graphic"]/@src')
            if images:
                p.image = images[0]

            yield p
Exemplo n.º 22
0
 def scrape(self):
     """Yield a Person (role 'MLA') for each named row of the members CSV.

     Rows whose first, middle and last name are all blank are skipped. The
     photo is read from the member's contact page, addressed by riding number.
     """
     csv_text = self.get(self.get_csv_url()).text
     cr = csv.DictReader(StringIO(csv_text))
     for mla in cr:
         name = '{} {} {}'.format(mla['MLA First Name'],
                                  mla['MLA Middle Names'],
                                  mla['MLA Last Name'])
         if name.strip() == '':
             continue
         party = get_party(mla['Caucus'])
         # Drop anything after the first comma in the name.
         name_without_status = name.split(',')[0]
         detail_url = ('http://www.assembly.ab.ca/net/index.aspx?'
                       'p=mla_contact&rnumber={0}&leg=29'.format(
                           mla['Riding Number']))
         detail_page = self.lxmlize(detail_url)
         photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
         p = Person(
             primary_org='legislature',
             name=name_without_status,
             district=mla['Riding Name'],
             role='MLA',
             party=party,
             image=photo_url,
         )
         p.add_source(COUNCIL_PAGE)
         p.add_source(detail_url)
         # The e-mail column is either 'Email' or 'MLA Email'. Look both up
         # with .get() so a CSV that only has 'MLA Email' does not raise
         # KeyError before the fallback is reached.
         email = mla.get('Email') or mla.get('MLA Email')
         if email:
             p.add_contact('email', email)
         if mla['Phone Number']:
             p.add_contact('voice', mla['Phone Number'], 'legislature')
         yield p
Exemplo n.º 23
0
    def scrape(self):
        """Yield a Person (role 'MLA') for every non-vacant row of the members table.

        The party abbreviation is taken from the parentheses that follow the
        member's name.
        """
        page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        members = page.xpath('//table/tbody/tr')
        assert len(members), 'No members found'
        for row in members:
            # Whitespace-normalised text of every cell after the first one;
            # exactly three such cells are expected.
            cell_texts = [' '.join(cell.text_content().split()) for cell in row[1:]]
            riding, table_name, email = cell_texts

            if 'Vacant' in table_name:
                continue

            district = riding.replace('\x97', '-')
            parsed = re.match(r'(.+) \((.+)\)', table_name)
            name_with_status, party_abbr = parsed.groups()
            name = name_with_status.split(',')[0]
            photo_page_url = row[2][0].attrib['href']
            photo_url = self.get_photo_url(photo_page_url)

            if district == 'Saint Croix':
                # @see https://en.wikipedia.org/wiki/Charlotte-Campobello
                district = 'Charlotte-Campobello'
            elif district == 'Oromocto-Lincoln-Fredericton':
                # @see https://en.wikipedia.org/wiki/Oromocto-Lincoln-Fredericton
                district = 'Oromocto-Lincoln'

            party = get_party(party_abbr.strip())
            p = Person(primary_org='legislature', name=name, district=district,
                       role='MLA', party=party, image=photo_url)
            p.add_contact('email', email)
            p.add_source(photo_page_url)
            p.add_source(COUNCIL_PAGE)
            yield p
Exemplo n.º 24
0
    def scrape_mayor(self, url):
        """Return a Person for the mayor described on the page at ``url``.

        The e-mail address is not scraped: it is built from the first letter
        of the first name plus the last name at ``langleycity.ca``.
        """
        infos_page = self.lxmlize(url)
        infos = infos_page.xpath('//div[@class="item-page"]')[0]

        # The name is the third and fourth space-separated words of the
        # second paragraph's first text node.
        words = infos.xpath('p[2]/text()')[0].split(' ')
        name = ' '.join(words[2:4])
        lowered = name.lower().split(' ')
        email = lowered[0][0] + lowered[1] + '@langleycity.ca'
        photo_url = infos.xpath('p[1]/img/@src')[0]

        p = Person(primary_org='legislature', name=name, district='Langley', role='Mayor', image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)

        # The last paragraph carries the "Address: ... Phone ..." details.
        personal_infos = infos.xpath('p[last()]/text()')

        phone_matches = re.findall(r'Phone(:?) (.*)', '\n'.join(personal_infos))
        phone = phone_matches[0][1]
        address_matches = re.findall(r'Address: (.*) Phone', ' '.join(personal_infos))
        address = address_matches[0]
        p.add_contact('address', address, 'office')
        p.add_contact('voice', phone, 'office')

        return p
Exemplo n.º 25
0
    def scrape(self):
        """Yield the mayor, then one Person per councillor row in the centre table.

        Seats are numbered per ward in page order. The last link of a
        councillor's page is recorded when the page has more than two links.
        """
        seat_numbers = defaultdict(int)

        index_page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for row in index_page.xpath('//div[@id="centre_content"]//tr'):
            # Skip rows containing 'Position' (the header row).
            if 'Position' in row.text_content():
                continue

            cells = row.xpath('./td')
            ward = cells[0].text_content().replace('Councillor', '')
            seat_numbers[ward] += 1
            district = '{} (seat {})'.format(ward, seat_numbers[ward])
            name = row.xpath('./td')[1].text_content()
            url = row.xpath('./td/a')[0].attrib['href']

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            detail = self.lxmlize(url)

            content = detail.xpath('//div[@id="centre_content"]')[0]
            p.add_contact('email', self.get_email(content))
            p.add_contact('voice', self.get_phone(content, area_codes=[226, 519]), 'legislature')

            p.image = detail.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty

            links = detail.xpath('//div[@id="centre_content"]//a')
            if len(links) > 2:
                p.add_link(links[-1].attrib['href'])
            yield p
Exemplo n.º 26
0
    def scrape(self):
        """Yield a Person for every row of the ``Table1table`` council table.

        The first text line of the second cell carries the role (and, for
        councillors, the ward); the mayor's name is embedded in that same
        line, while councillors' names are on the following line.
        """
        role_aliases = {
            'Town and Regional Councillor': 'Regional Councillor',
            'Town Councillor': 'Councillor',
        }

        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            role_district = row.xpath('./td[2]/p/text()')[0].strip()
            if 'Mayor' not in role_district:
                name = row.xpath('./td[2]/p/text()')[1]
                # Split role from district at the space before "Ward".
                role, district = re.split(r' (?=Ward)', role_district)
                role = role_aliases.get(role, role)
            else:
                name = role_district.replace('Mayor and Regional Councillor', '')
                role = 'Mayor'
                district = 'Milton'

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('./td[1]/p//img/@src')[0]

            for entry in row.xpath('./td[3]/p[2]/text()'):
                # Each entry is "<type>:<number>"; the type doubles as the note.
                num_type, number = entry.split(':')
                number = number.replace(', ext ', ' x').strip()
                p.add_contact(num_type, number, num_type)

            yield p
Exemplo n.º 27
0
    def scrape(self):
        """Yield a Person for each link inside the ``WSIndent`` paragraphs.

        A link whose text contains "Ward N" is a councillor for that ward;
        any other link is treated as the mayor.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//p[@class="WSIndent"]/a'):
            label = link.text_content()
            ward = re.search(r'(Ward [0-9]{1,2})', label)
            if ward is None:
                district = 'Kawartha Lakes'
                name = label.replace('Mayor', '').strip()
                role = 'Mayor'
            else:
                district = ward.group(1)
                name = label.replace(district, '').strip()
                role = 'Councillor'

            url = link.attrib['href']
            profile = self.lxmlize(url)
            email = self.get_email(profile)
            image = profile.xpath('//img[@class="image-right"]/@src')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.add_contact('email', email)
            p.image = image
            yield p
Exemplo n.º 28
0
    def scrape(self):
        """Yield a Person for each ``h1.title`` heading of the form "name, district".

        Headings without a comma are ignored. The mayor's phone and email
        live in sibling ``div`` elements; councillors' live in sibling ``p``
        elements, and a councillor's phone is optional.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for heading in page.xpath('//h1[@class="title"]'):
            title = heading.text_content()
            if ',' not in title:
                continue
            name, district = title.split(',')
            name = name.strip()

            if 'Mayor' in district:
                p = Person(primary_org='legislature', name=name, district='Beaconsfield', role='Maire')
                p.add_source(COUNCIL_PAGE)
                p.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
                phone_text = heading.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
                # Keep what follows the colon, with spaces turned into hyphens.
                phone = phone_text.split(':')[1].strip().replace(' ', '-')
                p.add_contact('voice', phone, 'legislature')
                script = heading.xpath('.//parent::div/following-sibling::div/script')[0].text_content()
                p.add_contact('email', get_email(script))
            else:
                # Councillor districts are the part after the first hyphen.
                district = district.split('-')[1].strip()
                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
                p.add_source(COUNCIL_PAGE)

                p.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]

                phone_texts = heading.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
                if phone_texts:
                    phone = phone_texts[0].split(':')[1].strip().replace(' ', '-')
                    p.add_contact('voice', phone, 'legislature')
                script = heading.xpath('.//parent::div/following-sibling::p/script')[0].text_content()
                p.add_contact('email', get_email(script))
            yield p
Exemplo n.º 29
0
    def scrape(self):
        """Yield an Organization and then a Person for each councillor entry.

        Entries are the ``<strong>`` elements inside paragraphs of the
        ``entry-content`` block. Entries whose role text contains 'SAO', or
        whose role is empty, are skipped. Contact details are attached to the
        membership rather than to the person.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="entry-content"]//p/strong')
        for councillor in councillors:
            # NOTE(review): .decode('utf-8') on a string literal only exists on
            # Python 2 byte strings; Python 3 str has no decode() — confirm the
            # target interpreter before porting. The literals themselves may
            # also be mis-encoded in this copy; verify against the original.
            # The district is the text before the dash in the last h2 returned
            # by the preceding-sibling lookup.
            district = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1].text_content().split('–'.decode('utf-8'))[0]
            # The name is taken to be the last two whitespace-separated words.
            name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â'.decode('utf-8'), '')
            # The role is whatever remains before the first hyphen once the name is removed.
            role = councillor.text_content().replace(name, '').split('-')[0]
            if 'SAO' in role or not role:
                continue

            # An Organization is created and yielded for every kept entry.
            org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org

            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role=role, district=district)

            # Classify each text node of the enclosing paragraph by the marker it contains.
            info = councillor.xpath('./ancestor::p/text()')
            for contact in info:
                if 'NT' in contact:
                    membership.add_contact_detail('address', contact.strip(), 'legislature')
                if 'Tel' in contact:
                    # Turn "Tel. (xxx) yyy" into "xxx-yyy".
                    contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('voice', contact, 'legislature')
                if 'Fax' in contact:
                    contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                    membership.add_contact_detail('fax', contact, 'legislature')
            email = self.get_email(councillor, './parent::p')
            membership.add_contact_detail('email', email)

            # When the paragraph mentions a website, its second anchor is used as the link.
            if 'Website' in councillor.xpath('./parent::p')[0].text_content():
                p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
            yield p
Exemplo n.º 30
0
    def scrape(self):
        """Yield a Person for each ``councillorwrapper`` block on the page.

        The first block is treated as the mayor when it carries no district
        text; every other block is a councillor for the district it names.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        wrappers = page.xpath('//div[contains(@class, "councillorwrapper")]')
        assert len(wrappers), 'No councillors found'
        for position, wrapper in enumerate(wrappers):
            name = wrapper.xpath('.//h4/text()')[0]
            district = wrapper.xpath('.//h4/span/text()')[0].strip()

            if position == 0 and not district:
                district, role, email = 'Calgary', 'Mayor', '*****@*****.**'
            else:
                role, email = 'Councillor', None

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            # First src attribute found anywhere inside the block.
            p.image = wrapper.xpath('.//@src')[0]
            if email:
                p.add_contact('email', email)
            p.add_source(COUNCIL_PAGE)
            yield p
Exemplo n.º 31
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor detail page.

        Councillor links are found in article-body paragraphs whose text
        contains a hyphen. On each detail page the first body paragraph
        supplies the ward number and the second one supplies colon-separated
        contact details.
        """
        listing = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        yield self.scrape_mayor()

        for entry in listing.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]'):
            url = entry.xpath('.//a')[0].attrib['href'].replace('../', '')
            profile = self.lxmlize(url, 'iso-8859-1')

            title = profile.xpath('//div[@class="articletitle"]/h1')[0].text_content()
            name = title.replace('Councillor', '').replace('Deputy Mayor', '')
            paragraphs = profile.xpath('//div[@class="articlebody-inside"]/p')
            # Keep only the digits of the first paragraph as the ward number.
            ward_digits = re.sub(r'\D+', '', paragraphs[0].text_content())
            district = 'Ward {}'.format(ward_digits)

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            photo_url_rel = profile.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
            p.image = urljoin(url, photo_url_rel)

            contact_text = paragraphs[1].text_content().replace('Biography', '').replace('Committees', '')
            fields = contact_text.split(':')
            # Each value follows a colon; its label is the capitalised word
            # found in the chunk just before that colon.
            for label_chunk, value in zip(fields, fields[1:]):
                if not value:
                    continue
                contact_type = re.findall(r'([A-Z][a-z]+)', label_chunk)[0]
                if contact_type != 'Address':
                    # Cut the value at the first capital letter (start of the next label).
                    value = re.split(r'[A-Z]', value)[0]
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(contact_type, value, '' if contact_type == 'email' else 'legislature')
            yield p
Exemplo n.º 32
0
    def scrape(self):
        """Yield councillor results, then the mayor's Person record.

        Every captioned panel except the first is handed to
        ``councillor_data``; the mayor comes from the first image panel plus
        the separate mayor page, which supplies the phone number.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        for panel in page.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]:
            href = panel.xpath('.//a[1]/@href')[0]
            url = urljoin(COUNCIL_PAGE, href)
            name = panel.xpath('.//a//text()')[0]
            # The ward is the caption text minus its final word.
            caption_words = panel.xpath('.//strong//text()')[0].split()
            ward = ' '.join(caption_words[:-1])
            yield self.councillor_data(url, name, ward)

        mayor_panel = page.xpath('//div[contains(@class, "cocis-image-panel")]')[0]
        mayor_photo_src = mayor_panel.xpath('.//img/@src')[0]
        photo_url = urljoin(COUNCIL_PAGE, mayor_photo_src)
        name = mayor_panel.xpath('.//a//text()')[0]
        mayor_page = self.lxmlize(MAYOR_PAGE)
        # The email is behind mailhide, so only the phone is collected.
        phone = self.get_phone(mayor_page, area_codes=[403])

        m = Person(primary_org='legislature', name=name, district='Calgary', role='Mayor')
        for source in (COUNCIL_PAGE, MAYOR_PAGE):
            m.add_source(source)
        m.add_contact('voice', phone, 'legislature')
        m.image = photo_url
        yield m
Exemplo n.º 33
0
    def scrape(self):
        """Yield a Person for each non-vacant row of the ``MLAs`` table.

        The header row is skipped. Party and district come from the listing
        row; photo, website, address, phones and email come from the
        member's own page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
            member_text = row.xpath('./td')[0].text_content()
            if 'Vacant' in member_text:
                continue

            # Drop everything up to and including the first ". " (the honorific).
            name = member_text.split('. ', 1)[1]
            party = row.xpath('./td')[1].text
            district = row.xpath('./td')[2].text_content()
            url = row.xpath('./td[1]/a/@href')[0]
            profile = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = profile.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]

            contact = profile.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website_links = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website_links:
                p.add_link(website_links[0].text_content())

            address_parts = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]
            p.add_contact('address', ' '.join(address_parts), 'constituency')
            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')[0]
            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')[0]
            p.add_contact('voice', phone_leg, 'legislature', area_code=306)
            p.add_contact('voice', phone_const, 'constituency', area_code=306)
            p.add_contact('email', self.get_email(contact))

            yield p
Exemplo n.º 34
0
    def scrape_mayor(self):
        """Build and return the mayor's Person record from ``MAYOR_PAGE``.

        The name comes from the article title; phone, postal address and
        email are read from fixed paragraph positions in the article body.
        """
        page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')

        heading = page.xpath('//div[@class="articletitle"]/h1')[0]
        name = heading.text_content().replace('Mayor', '')

        p = Person(primary_org='legislature', name=name, district='Summerside', role='Mayor')
        p.add_source(MAYOR_PAGE)
        photo_src = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
        p.image = photo_src.replace('..', '')

        paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
        # Paragraph 1: phone follows "to "; paragraphs 3-4: mailing address;
        # paragraph 5: email.
        phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
        address_first = paragraphs[3].text_content().replace('by mail: ', '')
        address = address_first + ' ' + paragraphs[4].text_content()
        email = self.get_email(paragraphs[5])

        p.add_contact('voice', phone, 'legislature')
        p.add_contact('address', address, 'legislature')
        p.add_contact('email', email)

        return p
Exemplo n.º 35
0
    def scrape(self):
        """Yield a Person for each ``views-row`` inside the ``view-people`` block.

        A row whose subtitle is exactly 'Mayor' is the mayor; any other
        subtitle is read as "<role>, <district>" for a councillor. Email and
        phone come from the linked detail page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        rows = listing.xpath('//div[contains(@class, "view-people")]//div[contains(@class, "views-row")]')
        assert len(rows), 'No councillors found'
        for row in rows:
            name = row.xpath('.//div[@property="dc:title"]')[0].text_content()
            subtitle_paragraphs = row.xpath('.//div[contains(@class, "field-name-field-sub-title")]//p')
            # Second-to-last subtitle paragraph; non-breaking spaces become plain spaces.
            role_and_district = subtitle_paragraphs[-2].text_content().replace('\xa0', ' ')

            if role_and_district != 'Mayor':
                district = role_and_district.split(', ', 1)[1]
                role = 'Councillor'
            else:
                district = 'Fredericton'
                role = 'Mayor'

            url = row.xpath('.//@href')[0]
            profile = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.image = row.xpath('.//img[@typeof="foaf:Image"]/@src')[0]
            p.add_contact('email', self.get_email(profile))
            p.add_contact('voice', self.get_phone(profile, area_codes=[506]), 'legislature')
            for source in (COUNCIL_PAGE, url):
                p.add_source(source)
            yield p
Exemplo n.º 36
0
    def scrape(self):
        """Yield the mayor, then a Person for each landing-block link but the last.

        District and name come from the headings of each linked page; fax and
        phone entries come from the sidebar paragraphs (the final sidebar
        paragraph is not examined).
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor()

        for link in listing.xpath('//h2[@class="landing-block-title"]/a')[:-1]:
            url = link.attrib['href']
            profile = self.lxmlize(url)

            district = profile.xpath('//div[@id="main-content"]/h1/text()')[0]
            name = profile.xpath('//div[@id="main-content"]/h2/text()')[0]

            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            sidebar_paragraphs = profile.xpath('//aside[@class="page-sidebar"]/div[1]/p')
            for paragraph in sidebar_paragraphs[:-1]:
                contact_type = paragraph.xpath('./strong/text()')[0]
                if 'Contact' not in contact_type:
                    value = paragraph.xpath('./a/text()')[0]
                    # The two checks are independent; a label may match both.
                    if 'Fax' in contact_type:
                        p.add_contact('fax', value, 'legislature')
                    if 'Phone' in contact_type:
                        p.add_contact(contact_type, value, contact_type)

            yield p
Exemplo n.º 37
0
    def councillor_data(self, url, name, ward):
        """Yield a single councillor Person built from the page at ``url``.

        The phone number is looked up in the main content container first
        and, if none is found there, in the lower content container. No
        voice contact is added when neither lookup returns a number.
        """
        profile = self.lxmlize(url)
        # sadly, email is a form on a separate page
        photo_src = profile.xpath('//div[contains(@id, "contentcontainer")]//img/@src')[0]
        photo_url = urljoin(url, photo_src)

        m = Person(primary_org='legislature', name=name, district=ward, role='Councillor')
        for source in (COUNCIL_PAGE, url):
            m.add_source(source)

        upper = profile.xpath('//div[@id="contentcontainer"]')[0]
        phone = self.get_phone(upper, area_codes=[306], error=False)
        if not phone:
            # Fall back to the lower container only when the first lookup fails.
            lower = profile.xpath('//div[@id="lowercontentcontainer"]')[0]
            phone = self.get_phone(lower, area_codes=[306], error=False)
        if phone:
            m.add_contact('voice', phone, 'legislature')

        m.image = photo_url
        yield m
Exemplo n.º 38
0
    def scrape(self):
        """Yield an MPP Person for every ``addressblock`` on the listing page.

        Name, riding, email and phone come from the listing block; photo and
        party come from the member's own page. Photo, email and phone are
        each optional.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        for block in listing.xpath('//div[@class="addressblock"]'):
            link = block.xpath('.//a[@class="mpp"]')[0]
            # Collapse any run of whitespace in the link text to single spaces.
            name = ' '.join(link.text.split())

            riding_text = block.xpath('.//div[@class="riding"]//text()')[0]
            riding = riding_text.strip().replace('--', '\u2014')
            district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
            mpp_url = link.attrib['href']

            mpp_page = self.lxmlize(mpp_url)

            photo_srcs = mpp_page.xpath('//img[@class="mppimg"]/@src')
            party = mpp_page.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()

            p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
            if photo_srcs:
                p.image = photo_srcs[0]
            for source in (COUNCIL_PAGE, mpp_url):
                p.add_source(source)

            email_nodes = block.xpath('.//div[@class="email"]')
            if email_nodes:
                p.add_contact('email', self.get_email(email_nodes[0]))

            phone_texts = block.xpath('.//div[@class="phone"]//text()')
            if phone_texts:
                p.add_contact('voice', phone_texts[0], 'legislature')

            yield p
Exemplo n.º 39
0
    def scrape(self):
        """Yield the mayor, then a Person for every non-vacant table row.

        Rows for Greenfield Park (or labelled "Conseiller n...") share one
        district name, so they are numbered with a running seat counter.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        yield self.scrape_mayor(page)

        rows = page.xpath('//tbody/tr')
        assert len(rows), 'No councillors found'
        next_seat = 1
        for row in rows:
            if row.xpath('./td[2]//text()')[0] == 'Vacant':
                continue

            district = row.xpath('./td[1]/text()')[0]
            if 'Greenfield Park' in district or 'Conseiller n' in district:
                district = 'Greenfield Park (siège {})'.format(next_seat)
                next_seat += 1
            detail_url = row.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')

            name = detail_page.xpath('//h1/text()')[0]
            # Prefer an image whose alt text contains the name; otherwise use
            # the first image with the "droite" class.
            named_photos = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
            if not named_photos:
                photo_url = detail_page.xpath('//img[contains(@class, "droite")]/@src')[0]
            else:
                photo_url = named_photos[0]

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            p.image = photo_url
            p.add_contact('email', self.get_email(detail_page))
            yield p
Exemplo n.º 40
0
    def scrape(self):
        """Yield an MLA Person for each non-vacant row of the first table.

        Each row must hold exactly three cells (name, constituency, party).
        Names are listed as "Last, First" and are rearranged to "First Last".
        """
        member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
        # Skip the header row of the first table on the page.
        rows = member_page.xpath('//table')[0].xpath('.//tr')[1:]
        assert len(rows), 'No members found'
        for row in rows:
            name_cell, district_cell, party_cell = row.xpath('.//td')
            full_name = name_cell.text_content().strip()
            if full_name.lower() == 'vacant':
                continue

            last, first = full_name.split(',')
            given = first.replace('Hon.', '').strip()
            family = last.title().strip()
            name = given + ' ' + family
            # Normalise internal whitespace in the constituency name.
            district = ' '.join(district_cell.text_content().split())
            party = get_party(party_cell.text)

            url = name_cell.xpath('.//a')[0].get('href')

            profile = self.lxmlize(url)
            email = self.get_email(profile)

            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            for source in (COUNCIL_PAGE, url):
                p.add_source(source)
            p.add_contact('email', email)

            photo_srcs = profile.xpath('//img[@class="page_graphic"]/@src')
            if photo_srcs:
                p.image = photo_srcs[0]

            yield p
Exemplo n.º 41
0
    def scrape(self):
        """Yield a Person for each table row in the ``field-item even`` block.

        A row whose first-cell label contains 'Maire' is the mayor; all other
        rows are councillors for the labelled district. The photo is optional.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        rows = page.xpath('//div[@class="field-item even"]//tr')
        assert len(rows), 'No councillors found'
        for row in rows:
            label = row.xpath('./td[1]//strong/text()')[0].replace('no. ', '')
            if 'Maire' in label:
                district, role = 'Senneville', 'Maire'
            else:
                district, role = label, 'Conseiller'
            name = row.xpath('./td[2]//p//text()')[0].title()
            email = self.get_email(row)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            # Not every row has a photo; leave the image unset when none is found.
            photo_srcs = row.xpath('.//img/@src')
            if photo_srcs:
                p.image = photo_srcs[0]
            p.add_contact('email', email)
            yield p
Exemplo n.º 42
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor paragraph.

        Paragraphs following the "Councillors" heading (except the last) are
        read as "name, role[, district]". Regional councillors have no ward,
        so they are assigned numbered Whitby seats instead.
        """
        next_regional_seat = 1
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(page)

        paragraphs = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
        for paragraph in paragraphs:
            text = ' '.join(paragraph.xpath('./strong/text()'))
            if not text or 'Vacant' in text:
                continue

            name, role_district = text.split(', ', 1)

            if 'Regional Councillor' not in role_district:
                role, district = role_district.strip().split(', ')
                # Drop any parenthesised suffix from the district.
                district = district.split(' (')[0]
            else:
                role = role_district
                district = 'Whitby (seat {})'.format(next_regional_seat)
                next_regional_seat += 1

            email = self.get_email(paragraph)
            image = paragraph.xpath('./img/@src')[0]
            p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('email', email)
            yield p
Exemplo n.º 43
0
    def scrape(self):
        """Yield a Person for every councillor box, then one for the mayor.

        Councillors are the ``member-box--gray`` elements and take their
        district from the box itself; the mayor is the first
        ``member-box--main`` element and is assigned the district
        'Terrebonne'.

        Raises:
            AssertionError: if no councillor boxes are found.
            IndexError: if a box lacks a name, phone span or image, or if
                the mayor box is missing.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

        def build_person(box, role, district=None):
            """Create a Person from one member box.

            When ``district`` is None it is read from the box's
            ``fiche__category`` element.
            """
            name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
            if district is None:
                district = box.xpath('.//div[@class="fiche__category"]/text()')[0]
            # Keep the text after the first 'T' in the phone span.
            phone = box.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
            email_mailto = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
            photo_url = box.xpath('.//img')[0].attrib['src']

            person = Person(primary_org='legislature', name=name, district=district, role=role,
                            image=photo_url)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            if email_mailto:
                person.add_contact('email', email_mailto[0].split('mailto:')[1])
            return person

        councillors = page.xpath('//div[@class="member-box member-box--gray"]')
        assert len(councillors), 'No councillors found'
        for councillor_elem in councillors:
            yield build_person(councillor_elem, 'Conseiller')

        mayor_elem = page.xpath('//div[@class="member-box member-box--main"]')[0]
        # The photo is read from the mayor's own box. Previously it was read
        # from the last councillor's box, so the mayor got the wrong image.
        yield build_person(mayor_elem, 'Maire', district='Terrebonne')
Exemplo n.º 44
0
    def scrape(self):
        """Yield a Person for each heading on the council page.

        Headings are paired positionally with rows of the "Mayor & Council"
        contact table (header row skipped), which supply phone and fax.
        Councillors get numbered Abbotsford seats; any heading that does not
        start with 'Councill' is treated as the mayor.
        """
        next_seat = 1

        coun_page = self.lxmlize(COUNCIL_PAGE)
        contact_page = self.lxmlize(CONTACT_PAGE)
        headings = coun_page.xpath('//div[@id="main-content"]//h3')
        contact_rows = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]

        for heading, contact_row in zip(headings, contact_rows):
            text = heading.text_content()
            if not text.startswith('Councill'):
                role, district = 'Mayor', 'Abbotsford'
            else:
                role = 'Councillor'
                district = 'Abbotsford (seat {})'.format(next_seat)
                next_seat += 1
            # The name is everything after the first word (the title).
            name = text.split(' ', 1)[1]
            image = heading.xpath('./img/@src')[0]
            phone = contact_row.xpath('./td[2]/text()')[0]
            fax = contact_row.xpath('./td[3]/text()')[0]

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            for source in (COUNCIL_PAGE, CONTACT_PAGE):
                p.add_source(source)
            p.image = image
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('fax', fax, 'legislature')

            yield p
Exemplo n.º 45
0
    def scrape(self):
        """Yield the mayor, then a Person for each linked councillor cell.

        The first matching table cell is handed to ``scrape_mayor``; the
        remaining cells are councillors. Cells without a link are skipped.
        Phone numbers, links and email are collected from each councillor's
        own page.

        Raises:
            AssertionError: if the listing page has no matching cells.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
        # Check before indexing: otherwise an empty result raises IndexError on
        # councillors[0] and this message is never shown.
        assert len(councillors), 'No councillors found'
        yield self.scrape_mayor(councillors[0])
        for councillor in councillors[1:]:
            if not councillor.xpath('.//a'):
                continue

            # First two non-blank text nodes are the name and the district.
            texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
            name = texts[0]
            district = texts[1]
            url = councillor.xpath('.//a/@href')[0]
            page = self.lxmlize(url)

            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)

            # The photo is in a preceding sibling cell; take the last match.
            p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]

            contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
            for contact in contacts:
                # Any text node with four consecutive digits is treated as a phone number.
                if re.findall(r'[0-9]{4}', contact):
                    phone = contact.strip().replace(' ', '-')
                    p.add_contact('voice', phone, 'legislature')
            get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

            email = self.get_email(page)
            p.add_contact('email', email)
            yield p
Exemplo n.º 46
0
    def scrape(self):
        """Yield a Person for every odd-positioned row of the content tables.

        A row's bold text decides the role: 'Deputy Warden', 'Warden', or a
        plain councillor. Councillors receive numbered Lambton seats.
        """
        next_seat = 1

        page = self.lxmlize(COUNCIL_PAGE)
        rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
        assert len(rows), 'No councillors found'
        for row in rows:
            text = row.xpath('.//strong/text()')[0]
            # Test the longer title first: 'Warden' is a substring of 'Deputy Warden'.
            for title in ('Deputy Warden', 'Warden'):
                if title in text:
                    role = title
                    name = text.replace(title, '')
                    district = 'Lambton'
                    break
            else:
                role = 'Councillor'
                name = text
                district = 'Lambton (seat {})'.format(next_seat)
                next_seat += 1

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)

            p.image = row.xpath('.//img/@src')[0]
            p.add_contact('email', self.get_email(row))

            yield p
Exemplo n.º 47
0
    def scrape(self):
        """Yield the mayor, then a Person for each councillor table cell.

        Table rows are consumed in pairs: each odd-indexed row holds one cell
        per councillor (name and photo) and the row right after it holds the
        matching district and phone number at the same cell position.
        """
        phone_pattern = r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}'

        page = self.lxmlize(COUNCIL_PAGE)

        mayor_paragraph = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
        mayor_name = mayor_paragraph.xpath('.//strong/text()')[0]

        mayor = Person(primary_org='legislature', name=mayor_name, district='Pointe-Claire', role='Maire')
        mayor.add_source(COUNCIL_PAGE)

        mayor_phone = re.findall(phone_pattern, mayor_paragraph.text_content())[0]
        mayor.add_contact('voice', mayor_phone.replace(' ', '-'), 'legislature')
        yield mayor

        rows = page.xpath('//tr')
        for i in range(1, len(rows), 2):
            for j, cell in enumerate(rows[i].xpath('./td')):
                name = cell.text_content()
                # The following row carries the details for this cell position.
                details_row = rows[i + 1]
                district = details_row.xpath('.//td/p[1]/text()')[j].replace(' / ', '/')

                p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
                p.add_source(COUNCIL_PAGE)
                p.image = cell.xpath('.//img/@src')[0]

                details_text = details_row.xpath('.//td')[j].text_content()
                phone = re.findall(phone_pattern, details_text)[0].replace(' ', '-')

                p.add_contact('voice', phone, 'legislature')

                yield p
Exemplo n.º 48
0
    def scrape_councillor(self, url, district):
        """Build a councillor's Person record from the profile page at ``url``.

        The name is read from the second paragraph of the ``item-page``
        block and the email address is derived from it. A residence address
        and phone are added only when the first text line of the last
        paragraph mentions 'Residence'.
        """
        container = self.lxmlize(url).xpath('//div[@class="item-page"]')[0]

        heading_words = container.xpath('p[2]/text()')[0].split(' ')
        name = ' '.join(heading_words[1:3])
        name_parts = name.lower().split(' ')
        # Address is built as: first letter of the first word + second word.
        email = name_parts[0][0] + name_parts[1] + '@langleycity.ca'
        photo_url = container.xpath('p[1]/img/@src')[0]

        p = Person(primary_org='legislature', name=name, district=district, role='Councillor', image=photo_url)
        for source in (COUNCIL_PAGE, url):
            p.add_source(source)
        p.add_contact('email', email)

        detail_lines = container.xpath('p[last()]/text()')

        if 'Residence' not in detail_lines[0]:
            return p

        # Group index 2 of the phone pattern is the number itself; group
        # index 0 of the address pattern is the address text.
        phone = re.findall(r'(Phone|Res)(:?) (.*)', '\n'.join(detail_lines))[0][2]
        address = re.findall(r'Address: (.*) (Phone|Res)', ' '.join(detail_lines))[0][0]
        p.add_contact('address', address, 'residence')
        p.add_contact('voice', phone, 'residence')

        return p
Exemplo n.º 49
0
    def scrape(self):
        """Yield a ``Person`` for each member linked from ``COUNCIL_PAGE``.

        Every link found under ``//center/center//a`` is followed. The detail
        page's section heading decides the role and district; members whose
        heading carries no ``", <ward> -"`` part get a numbered
        ``'Richmond Hill (seat N)'`` district, counted in page order. Address,
        phone and fax are pulled out of the flattened text of the contact
        table cell with regular expressions.
        """
        next_seat = 1

        listing = self.lxmlize(COUNCIL_PAGE)

        for link in listing.xpath('//center/center//a'):
            name = link.text_content().strip()
            url = link.attrib['href']
            detail = self.lxmlize(url)
            heading = detail.xpath(
                '//div[@class="sectionheading"]')[0].text_content()

            if heading == 'Mayor of Richmond Hill':
                district, role = 'Richmond Hill', 'Mayor'
            else:
                ward_matches = re.findall(r',(.*)-', heading)
                if not ward_matches:
                    # No ward in the heading: hand out the next numbered seat.
                    district = 'Richmond Hill (seat {})'.format(next_seat)
                    next_seat += 1
                else:
                    district = ward_matches[0].strip()

                if 'Regional' in heading:
                    role = 'Regional Councillor'
                else:
                    role = 'Councillor'

            contact_cells = detail.xpath(
                '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
            )
            info = contact_cells[0].text_content()
            info = info.replace(' - office:', ':')

            raw_address = re.findall(
                r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
                info)[0]
            # Re-insert the spaces lost where a lowercase letter runs straight
            # into an uppercase one in the flattened text.
            address = re.sub(r'([a-z])([A-Z])', r'\1 \2', raw_address)

            # The original author noted that the simpler pattern
            # '(.*)(?=\sTelephone|Telephone|Fax)' did not work here.
            raw_phone = re.findall(
                r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
                info)[0][0]
            phone = raw_phone.replace('(', '')
            phone = phone.replace(') ', '-')
            phone = phone.replace(', ext. ', ' x')

            raw_fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0]
            fax = raw_fax.replace(' ', '')
            fax = fax.replace('(', '')
            fax = fax.replace(')', '-')

            email = self.get_email(detail)

            person = Person(
                primary_org='legislature',
                name=name,
                district=district,
                role=role,
            )
            person.add_source(COUNCIL_PAGE)
            person.add_source(url)
            person.add_contact('address', address, 'legislature')
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('fax', fax, 'legislature')
            person.add_contact('email', email)

            photo_query = '//img[contains(@alt, "{}")]/@src'.format(name)
            person.image = detail.xpath(photo_query)[0]

            if 'Website' in info:
                website = re.findall(r'www\..*\.[a-z]+', info)[0]
                person.add_link(website)
            yield person
Exemplo n.º 50
0
    def scrape(self):
        """Yield a ``Person`` for each member linked from the council menu.

        Each href under the "Meet Your Council" menu entry is fetched and read
        from its ``printArea`` block. A heading containing ``'Mayor'`` marks
        the mayor; regional councillors keep their full ``<h2>`` text as role
        and get a numbered ``'Whitby (seat N)'`` district in page order; other
        members have their ``<h2>`` split into ``role, district``.
        """
        next_seat = 1
        listing = self.lxmlize(COUNCIL_PAGE)

        links = listing.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
        assert links, 'No councillors found'

        for link in links:
            area = self.lxmlize(link).xpath('//div[@id="printArea"]')[0]
            name = area.xpath('.//h1/text()')[0]

            if 'Mayor' not in name:
                subtitle = area.xpath('.//h2/text()')[0]
                if 'Regional Councillor' not in subtitle:
                    role, ward = subtitle.split(', ')
                    # Drop anything from the first " (" onwards.
                    district = ward.split(' (')[0]
                else:
                    role = subtitle
                    district = 'Whitby (seat {})'.format(next_seat)
                    next_seat += 1
            else:
                role = 'Mayor'
                district = 'Whitby'
                name = name.replace('Mayor ', '')

            photo = area.xpath('.//img/@src')[0]

            person = Person(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', self.get_phone(area), 'legislature')
            person.add_contact('email', self.get_email(area))
            person.image = photo

            yield person
Exemplo n.º 51
0
    def scrape(self):
        """Yield a ``Person`` for each non-vacant entry on ``COUNCIL_PAGE``.

        Entries are the ``<div>`` elements whose class contains ``"ligne"``.
        The nearest preceding ``<h2>`` decides whether the entry is the mayor
        (heading contains ``'Mairie'``) or a councillor, in which case the
        district name is parsed from the entry's ``target="_blank"`` link text.

        Raises ``AttributeError`` if a councillor's link text does not match
        the expected "District électoral de/du/des <name> - <number>" form,
        and ``IndexError`` if an expected node is missing.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        councillors = page.xpath('//div[contains(@class, "ligne")]')
        for councillor in councillors:

            # The heading is split on ", " and reversed before re-joining
            # (presumably "Last, First" -> "First Last").
            name = ' '.join(councillor.xpath('.//h3')[0].text_content().strip().split(', ')[::-1])
            if 'vacant' in name:
                continue
            district = councillor.xpath('./preceding-sibling::h2/text()')[-1]
            if 'Mairie' in district:
                district = 'Québec'
                role = 'Maire'
            else:
                text = councillor.xpath('.//a[@target="_blank"]/text()')
                # Raw string: \A, \d and \Z are regex escapes, not string
                # escapes, and are invalid in a plain string literal.
                district = re.search(
                    r'\ADistrict électoral (?:de|du|des) (.+) - ?\d+\Z',
                    text[0].strip().replace('\xa0', ''),
                    flags=re.U,
                ).group(1)
                role = 'Conseiller'

            if district == 'Monts':
                district = 'Les Monts'
            elif district == 'Plateau':
                district = 'Le Plateau'
            else:
                district = district.replace('–', '—')  # n-dash, m-dash
                district = re.sub(r'\Ala ', 'La ', district)

            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./p//img/@src')[0]

            phone = self.get_phone(councillor, area_codes=[418])
            p.add_contact('voice', phone, 'legislature')
            yield p
Exemplo n.º 52
0
    def scrape(self):
        """Yield the mayor, then a ``Person`` for each listed councillor.

        The mayor comes from ``self.scrape_mayor``. Councillors are the ``<p>``
        siblings after the "Councillors" ``<h3>``, minus the last one.
        Paragraphs with no ``<strong>`` text, or with a ``<strong>`` text node
        equal to ``'Vacant'``, are skipped. Regional councillors get a
        numbered ``'Whitby (seat N)'`` district in page order.
        """
        next_seat = 1
        page = self.lxmlize(COUNCIL_PAGE)

        yield self.scrape_mayor(page)

        paragraphs = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')
        paragraphs = paragraphs[:-1]
        assert paragraphs, 'No councillors found'

        for paragraph in paragraphs:
            strong_texts = paragraph.xpath('./strong/text()')
            if not strong_texts:
                continue
            if 'Vacant' in strong_texts:
                continue

            # Exactly two text nodes are expected: the name and the role line.
            raw_name, role_line = strong_texts
            name = raw_name.rstrip(',')

            if 'Regional Councillor' not in role_line:
                role, ward = role_line.strip().split(', ')
                district = ward.split(' (')[0]
            else:
                role = role_line
                district = 'Whitby (seat {})'.format(next_seat)
                next_seat += 1

            email = self.get_email(paragraph)
            photo = paragraph.xpath('./img/@src')[0]
            person = Person(primary_org='legislature', name=name, district=district, role=role, image=photo)
            person.add_source(COUNCIL_PAGE)
            person.add_contact('email', email)
            yield person
Exemplo n.º 53
0
    def scrape(self):
        """Yield one record per ward councillor, then the mayor.

        Councillors are the ``<td width="105">`` cells; each is handed to
        ``self.councillor_data`` together with its absolute detail URL, name
        and normalised ward. The mayor is read from the first
        ``<td width="315">`` cell and given a hard-coded e-mail address.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
        cells = page.xpath('//td[@width="105"]')
        assert cells, 'No councillors found'

        for cell in cells:
            href = cell.xpath('.//a/@href')[0]
            url = urljoin(COUNCIL_PAGE, href)

            link_text = cell.xpath('.//a//text()')[0]
            ward = re.search('([A-Z].+) Ward', link_text).group(1)
            # Spaced n-dash and spaced hyphen both become an unspaced m-dash.
            ward = ward.replace(' – ', '—')
            ward = ward.replace(' - ', '—')
            # Spelled without the period to match ocd-division-ids.
            ward = ward.replace('St. Norbert', 'St Norbert')

            name_parts = cell.xpath('.//span[@class="k80B"][1]/text()')
            name = ' '.join(name_parts)
            yield self.councillor_data(url, name, ward)

        mayor_cell = page.xpath('//td[@width="315"]')[0]
        mayor_label = mayor_cell.xpath('./a//text()')[0]
        # Drop the leading "Mayor " prefix by length.
        mayor_name = mayor_label[len('Mayor '):]
        mayor_photo = mayor_cell.xpath('./img/@src')[0]

        mayor = Person(
            primary_org='legislature',
            name=mayor_name,
            district='Winnipeg',
            role='Mayor',
        )
        mayor.add_source(COUNCIL_PAGE)
        # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
        mayor.add_contact('email', '*****@*****.**')  # hardcoded
        mayor.image = mayor_photo
        yield mayor
Exemplo n.º 54
0
    def scrape(self):
        """Yield a ``Person`` for each non-empty cell of the council table.

        The page is parsed as ``iso-8859-1``. The first cell returned by the
        query is treated as the mayor of Kirkland; every other cell is a
        councillor whose district is the part of its ``<h2>`` after ``"- "``,
        with `` Ouest``/`` Est`` lower-cased.
        """
        page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')

        cells = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
        assert cells, 'No councillors found'
        mayor_cell = cells[0]

        for cell in cells:
            if not cell.text_content().strip():
                continue

            if cell == mayor_cell:
                district, role = 'Kirkland', 'Maire'
            else:
                heading = cell.xpath('.//h2')[0].text_content()
                district = re.search('- (.+)', heading).group(1).strip()
                district = district.replace(' Ouest', ' ouest')
                district = district.replace(' Est', ' est')
                role = 'Conseiller'

            name = cell.xpath('.//strong/text()')[0]

            # Normalise e.g. "T 514 555-1234, # 123" step by step: drop the
            # "T " label, hyphenate spaces, then turn ",-#-" into " x".
            phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
            phone = phone.replace('T ', '')
            phone = phone.replace(' ', '-')
            phone = phone.replace(',-#-', ' x')
            email = self.get_email(cell)

            person = Person(
                primary_org='legislature',
                name=name,
                district=district,
                role=role,
            )
            person.add_source(COUNCIL_PAGE)
            person.add_contact('voice', phone, 'legislature')
            person.add_contact('email', email)
            person.image = cell.xpath('.//img/@src')[0]
            yield person
Exemplo n.º 55
0
 def scrape(self):
     """Yield a ``Person`` for each MLA row of the downloaded CSV.

     The CSV is fetched from ``self.get_csv_url()``. Rows whose combined
     first/middle/last name is blank are skipped. For each remaining row the
     member's contact page is fetched to pick up the photo; e-mail falls back
     from the ``Email`` column to ``MLA Email``.
     """
     response = self.get(self.get_csv_url())
     rows = csv.DictReader(response.text.split('\n'))
     url_template = (
         'http://www.assembly.ab.ca/net/index.aspx?'
         'p=mla_contact&rnumber={0}&leg=29'
     )

     for row in rows:
         full_name = '{} {} {}'.format(row['MLA First Name'], row['MLA Middle Names'], row['MLA Last Name'])
         if not full_name.strip():
             continue

         party = get_party(row['Caucus'])
         # Anything after the first comma is discarded from the name.
         display_name = full_name.split(',')[0]

         detail_url = url_template.format(row['Riding Number'])
         detail_page = self.lxmlize(detail_url)
         photo = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]

         person = Person(
             primary_org='legislature',
             name=display_name,
             district=row['Riding Name'],
             role='MLA',
             party=party,
             image=photo,
         )
         person.add_source(COUNCIL_PAGE)
         person.add_source(detail_url)

         email = row['Email'] or row['MLA Email']
         if email:
             person.add_contact('email', email)

         phone = row['Phone Number']
         if phone:
             person.add_contact('voice', phone, 'legislature')
         yield person
Exemplo n.º 56
0
    def scrape(self):
        """Yield a ``Person`` for each ``block text`` section of the page.

        A section whose ``<h2>`` contains ``'Maire'`` is the mayor of
        Sainte-Anne-de-Bellevue; any other section is a councillor for
        ``'District N'``, where N is the first run of digits in the heading.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        blocks = page.xpath('//div[@class="block text"]')
        assert blocks, 'No councillors found'

        for block in blocks:
            name = block.xpath(
                './/div[@class="content-writable"]//strong/text()')[0]
            heading = block.xpath('.//h2/text()')[0]

            if 'Maire' not in heading:
                number = re.search(r'\d+', heading)[0]
                district = 'District {}'.format(number)
                role = 'Conseiller'
            else:
                district = 'Sainte-Anne-de-Bellevue'
                role = 'Maire'

            person = Person(
                primary_org='legislature',
                name=name,
                district=district,
                role=role,
            )
            person.add_source(COUNCIL_PAGE)

            # First ``src`` attribute found anywhere inside the section.
            person.image = block.xpath('.//@src')[0]
            person.add_contact('email', self.get_email(block))
            yield person
Exemplo n.º 57
0
    def scrape(self):
        """Yield a ``Person`` with role ``'MLA'`` for each member cell.

        Member cells are the ``<td>`` parents of the picture field. Name and
        riding come from the cell's second and third children; the photo,
        e-mail and (optional) phone number come from the member's detail page.
        """
        listing = self.lxmlize(COUNCIL_PAGE)

        # Riding names as shown on the page -> names used for the district.
        riding_fixes = {
            'Mackenzie Delta': 'Mackenzie-Delta',
            'Tu Nedhe - Wiilideh': 'Tu Nedhe',
        }

        cells = listing.xpath('//div[@class="views-field views-field-field-picture"]/parent::td')
        for cell in cells:
            name = cell[1].text_content().replace(' .', '. ')  # typo on page
            raw_riding = cell[2].text_content().strip()
            riding = riding_fixes.get(raw_riding, raw_riding)

            detail_url = cell[0].xpath('.//a/@href')[0]
            detail = self.lxmlize(detail_url)
            photo = detail.xpath('//div[@class="field-item even"]/img/@src')[0]
            email = self.get_email(detail)

            first_paragraph = detail.xpath('//div[@property="content:encoded"]/p[1]//text()')
            contact_text = ''.join(first_paragraph)
            phone_match = re.search(r'P(hone)?: ([-0-9]+)', contact_text)

            person = Person(primary_org='legislature', name=name, district=riding, role='MLA', image=photo)
            person.add_source(COUNCIL_PAGE)
            person.add_source(detail_url)
            person.add_contact('email', email)
            # The phone number is optional: only added when the pattern matched.
            if phone_match is not None:
                person.add_contact('voice', phone_match.group(2), 'legislature')
            yield person