Exemplo n.º 1
0
    def scrape(self):
        """Yield the organizations, posts and people described by the CSV.

        Reads ``self.csv_url`` through ``self.csv_reader``, normalizes the
        header names with ``self.header_converter`` and, for every row
        accepted by ``self.is_valid_row``, applies ``self.corrections`` and
        yields, in order:

        * an ``Organization`` the first time a given organization name
          (compared case-insensitively) is seen,
        * a ``Post`` for the row's role within that organization, and
        * a ``Person`` that has passed ``validate()``.

        Any exception raised while processing a row is printed and the row is
        skipped, so one bad row does not abort the whole scrape. Objects that
        were already yielded for that row are not retracted.
        """
        organizations = {}
        # seat_numbers[role][district] counts the people seen so far for that
        # role in that district; used to build unique "(seat N)" labels.
        seat_numbers = defaultdict(lambda: defaultdict(int))

        reader = self.csv_reader(self.csv_url,
                                 delimiter=self.delimiter,
                                 header=True,
                                 encoding=self.encoding,
                                 skip_rows=self.skip_rows)
        reader.fieldnames = [
            self.header_converter(field) for field in reader.fieldnames
        ]

        organization_classification = 'legislature'

        for row in reader:
            try:
                if not self.is_valid_row(row):
                    continue

                # A correction is either a callable applied to the value or a
                # dict mapping known-bad values to their replacements.
                for key, corrections in self.corrections.items():
                    if not isinstance(corrections, dict):
                        row[key] = corrections(row[key])
                    elif row[key] in corrections:
                        row[key] = corrections[row[key]]

                organization_name = row['organization']
                organization_key = organization_name.lower()
                if organization_key in organizations:
                    organization = organizations[organization_key]
                else:
                    organization = Organization(
                        organization_name,
                        classification=organization_classification)
                    organization.add_source(self.csv_url)
                    yield organization
                    organizations[organization_key] = organization

                # Rows without an explicit role default to 'Councillor'.
                if not row['primary role']:
                    row['primary role'] = 'Councillor'
                role = row['primary role']

                yield Post(role=role,
                           label=organization_name,
                           organization_id=organization._id)

                name = row['name'].strip(' .,')
                district = row['district name']

                if self.many_posts_per_area and role not in self.unique_roles:
                    # Several people share the district: number the seats so
                    # each person gets a distinct district label.
                    seat_numbers[role][district] += 1
                    district = '{} (seat {})'.format(
                        district, seat_numbers[role][district])

                p = Person(primary_org=organization_classification,
                           name=name,
                           district=district,
                           role=role,
                           party=row.get('party name'))
                p.add_source(self.csv_url)

                if row.get('gender'):
                    p.gender = row['gender']
                if row.get('photo url'):
                    p.image = row['photo url']

                if row.get('source url'):
                    p.add_source(row['source url'].strip(' .,'))

                if row.get('website'):
                    p.add_link(row['website'], note='web site')
                if row.get('facebook'):
                    # Drop any query string or fragment from the profile URL.
                    p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                if row.get('twitter'):
                    p.add_link(row['twitter'])

                # Use .get() like the other optional columns so that a CSV
                # without an email/address column does not raise KeyError and
                # cause the whole person to be dropped.
                if row.get('email'):
                    p.add_contact('email', row['email'].strip(' .,'))
                if row.get('address'):
                    p.add_contact('address', row['address'], 'legislature')
                for column, contact_type in (('phone', 'voice'),
                                             ('fax', 'fax'),
                                             ('cell', 'cell')):
                    if row.get(column):
                        p.add_contact(contact_type, row[column], 'legislature')

                if row.get('birth date'):
                    p.birth_date = row['birth date']

                if row.get('incumbent'):
                    p.extras['incumbent'] = row['incumbent']

                for other_name in self.other_names.get(name, ()):
                    p.add_name(other_name)

                # Validate person entity so that we can catch the exception if needed.
                p.validate()

                yield p
            except Exception as e:
                # Best effort: report the problem and move on to the next row.
                print(repr(e))
                continue
Exemplo n.º 2
0
    def scrape_people(self, rows, gender):
        """Yield a ``Person`` for every member tile in ``rows``.

        For each tile the member's name, constituency, province, party and
        profile URL are read from the tile itself; the profile page is then
        fetched with ``self.lxmlize`` to collect the email, photo, personal
        web site, preferred languages and office contact details.

        Args:
            rows: sequence of elements (queried with ``xpath``), one per
                member tile.
            gender: value assigned to ``gender`` on every yielded person.

        Yields:
            Person: one per row, with sources, contacts and links attached.

        Raises:
            AssertionError: when iteration starts and ``rows`` is empty.
        """

        def parse_phone_and_fax(element):
            """Return ``(voice, fax)`` parsed from a phone/fax paragraph.

            The paragraph holds the telephone on its first line and, when
            present, the fax on its second line. ``fax`` is ``''`` when the
            second line is missing, so a value from another office can never
            leak into this one.
            """
            lines = element.text_content().strip().splitlines()
            # Note that https://www.ourcommons.ca/Members/en/michael-barrett(102275)#contact
            # has a empty value - "Telephone:". So the search / replace cannot include space.
            voice = lines[0].replace('Telephone:', '').replace(
                'Téléphone :', '').strip()
            fax = ''
            if len(lines) > 1:
                fax = lines[1].replace('Fax:', '').replace(
                    'Télécopieur :', '').strip()
            return voice, fax

        assert len(rows), 'No members found'
        for row in rows:
            name = row.xpath(
                './/div[@class="ce-mip-mp-name"][1]')[0].text_content()
            constituency = row.xpath(
                './/div[@class="ce-mip-mp-constituency"][1]')[0].text_content(
                )
            constituency = constituency.replace('–', '—')  # n-dash, m-dash
            if constituency == 'Mont-Royal':
                constituency = 'Mount Royal'

            province = row.xpath(
                './/div[@class="ce-mip-mp-province"][1]')[0].text_content()

            party = row.xpath(
                './/div[@class="ce-mip-mp-party"][1]')[0].text_content()

            url = row.xpath('.//a[@class="ce-mip-mp-tile"]/@href')[0]

            # Members from Québec are scraped from the French profile page.
            if province == 'Québec':
                url = url.replace('/en/', '/fr/')

            mp_page = self.lxmlize(url)
            email = self.get_email(mp_page,
                                   '//*[@id="contact"]/div/p/a',
                                   error=False)

            photo = mp_page.xpath(
                './/div[@class="ce-mip-mp-profile-container"]//img/@src')[0]

            m = Person(primary_org='lower',
                       name=name,
                       district=constituency,
                       role='MP',
                       party=party)
            m.add_source(COUNCIL_PAGE)
            m.add_source(url)
            m.gender = gender
            # @see https://www.ourcommons.ca/Members/en/ziad-aboultaif(89156)
            if email:
                m.add_contact('email', email)

            if photo:
                # Determine whether the photo is actually a generic silhouette
                photo_response = self.get(photo)
                if (photo_response.status_code == 200
                        and hashlib.sha1(photo_response.content).hexdigest()
                        not in IMAGE_PLACEHOLDER_SHA1):
                    m.image = photo

            # I don't think the new parliament website has personal website anymore
            personal_url = mp_page.xpath(
                './/a[contains(@title, "Personal Web Site")]/@href')
            if personal_url:
                m.add_link(personal_url[0])

            preferred_languages = mp_page.xpath(
                './/dt[contains(., "Preferred Language")]/following-sibling::dd/text()'
            )
            if preferred_languages:
                m.extras['preferred_languages'] = [
                    language.replace('/', '').strip()
                    for language in preferred_languages
                ]

            if province == 'Québec':
                m.add_contact('address',
                              'Chambre des communes\nOttawa ON  K1A 0A6',
                              'legislature')
            else:
                m.add_contact('address',
                              'House of Commons\nOttawa ON  K1A 0A6',
                              'legislature')

            # Hill Office contacts
            # Now phone and fax are in the same element
            # <p>
            #   Telephone: xxx-xxx-xxxx<br/>
            #   Fax: xxx-xxx-xxx
            # </p>
            phone_and_fax_el = mp_page.xpath(
                './/h4[contains(., "Hill Office")]/../p[contains(., "Telephone")]|.//h4[contains(., "Hill Office")]/../p[contains(., "Téléphone :")]'
            )
            if len(phone_and_fax_el):
                voice, fax = parse_phone_and_fax(phone_and_fax_el[0])
                if voice:
                    m.add_contact('voice', voice, 'legislature')

                if fax:
                    m.add_contact('fax', fax, 'legislature')

            # Constituency Office contacts
            # Some people has more than one, e.g. https://www.ourcommons.ca/Members/en/ben-lobb(35600)#contact
            for i, constituency_office_el in enumerate(
                    mp_page.xpath(
                        './/div[@class="ce-mip-contact-constituency-office-container"]/div'
                    )):
                # The first office is 'constituency'; later ones are numbered
                # 'constituency (2)', 'constituency (3)', ...
                note = 'constituency'
                if i:
                    note += ' ({})'.format(i + 1)

                address = constituency_office_el.xpath('./p[1]')[0]
                address = address.text_content().strip().splitlines()
                address = list(map(str.strip, address))
                m.add_contact('address', '\n'.join(address), note)

                phone_and_fax_el = constituency_office_el.xpath(
                    './p[contains(., "Telephone")]|./p[contains(., "Téléphone")]'
                )
                if len(phone_and_fax_el):
                    # fax is recomputed for every office; it is '' when this
                    # office lists no fax line.
                    voice, fax = parse_phone_and_fax(phone_and_fax_el[0])

                    if voice:
                        m.add_contact('voice', voice, note)

                    if fax:
                        m.add_contact('fax', fax, note)

            yield m