def scrape(self):
    """Yield organizations, posts and people built from the CSV at ``self.csv_url``.

    Each valid row (per ``self.is_valid_row``) is first passed through
    ``self.corrections``, then produces:

    * an ``Organization`` the first time its (case-insensitive) name is seen,
    * a ``Post`` for the row's role within that organization,
    * a ``Person`` carrying the contact details found in the row.

    Any exception raised while handling a row is printed and the row is
    skipped, so one malformed row does not abort the whole scrape.
    """
    organizations = {}
    # seat_numbers[role][district] counts how many people were already
    # assigned to that role/district pair.
    seat_numbers = defaultdict(lambda: defaultdict(int))

    reader = self.csv_reader(
        self.csv_url,
        delimiter=self.delimiter,
        header=True,
        encoding=self.encoding,
        skip_rows=self.skip_rows,
    )
    reader.fieldnames = [
        self.header_converter(field) for field in reader.fieldnames
    ]

    for row in reader:
        try:
            if not self.is_valid_row(row):
                continue

            # A correction is either a callable applied to the value, or a
            # dict mapping bad values to their replacements.
            for key, corrections in self.corrections.items():
                if not isinstance(corrections, dict):
                    row[key] = corrections(row[key])
                elif row[key] in corrections:
                    row[key] = corrections[row[key]]

            organization_classification = 'legislature'

            organization_name = row['organization']
            organization_key = organization_name.lower()
            if organization_key in organizations:
                organization = organizations[organization_key]
            else:
                organization = Organization(
                    organization_name,
                    classification=organization_classification)
                organization.add_source(self.csv_url)
                yield organization
                organizations[organization_key] = organization

            # Fall back to 'Councillor' when the role is blank or the
            # column is absent altogether.
            if not row.get('primary role'):
                row['primary role'] = 'Councillor'
            role = row['primary role']

            post = Post(role=role, label=organization_name,
                        organization_id=organization._id)
            yield post

            name = row['name'].strip(' .,')
            district = row['district name']

            if self.many_posts_per_area and role not in self.unique_roles:
                # Several people share the district: disambiguate with a
                # running seat number.
                seat_numbers[role][district] += 1
                district = '{} (seat {})'.format(
                    district, seat_numbers[role][district])

            p = Person(primary_org=organization_classification,
                       name=name,
                       district=district,
                       role=role,
                       party=row.get('party name'))
            p.add_source(self.csv_url)

            if row.get('gender'):
                p.gender = row['gender']
            if row.get('photo url'):
                p.image = row['photo url']

            if row.get('source url'):
                p.add_source(row['source url'].strip(' .,'))

            if row.get('website'):
                p.add_link(row['website'], note='web site')
            if row.get('facebook'):
                # Drop everything from the first '#' or '?' onwards.
                p.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                p.add_link(row['twitter'])

            # Like the other optional columns, email and address are read
            # with .get() so a CSV without them does not discard every row.
            if row.get('email'):
                p.add_contact('email', row['email'].strip(' .,'))
            if row.get('address'):
                p.add_contact('address', row['address'], 'legislature')
            if row.get('phone'):
                p.add_contact('voice', row['phone'], 'legislature')
            if row.get('fax'):
                p.add_contact('fax', row['fax'], 'legislature')
            if row.get('cell'):
                p.add_contact('cell', row['cell'], 'legislature')
            if row.get('birth date'):
                p.birth_date = row['birth date']

            if row.get('incumbent'):
                p.extras['incumbent'] = row['incumbent']

            if name in self.other_names:
                for other_name in self.other_names[name]:
                    p.add_name(other_name)

            # Validate person entity so that we can catch the exception if needed.
            p.validate()

            yield p
        except Exception as e:
            # Deliberate best-effort: report the problem and move on to the
            # next row.
            print(repr(e))
            continue
def scrape_people(self, rows, gender):
    """Yield a ``Person`` for every member tile in ``rows``.

    For each tile the member's profile page is fetched (the French version
    when the province is 'Québec') and the e-mail, photo, personal link,
    preferred languages, Hill Office contacts and constituency office
    contacts found there are attached to the person.

    :param rows: member tile elements; must not be empty.
    :param gender: value assigned to ``gender`` on every person yielded.
    :raises AssertionError: if ``rows`` is empty.
    """
    assert len(rows), 'No members found'

    def split_phone_and_fax(element):
        """Return ``(voice, fax)`` parsed from a "Telephone/Fax" paragraph.

        Telephone and fax share one element, one per line:

            <p>
              Telephone: xxx-xxx-xxxx<br/>
              Fax: xxx-xxx-xxxx
            </p>

        Either value is '' when missing, so a paragraph without a fax line
        neither raises nor reuses a number parsed from another office.
        """
        lines = element.text_content().strip().splitlines()
        if not lines:
            return '', ''
        # Note that https://www.ourcommons.ca/Members/en/michael-barrett(102275)#contact
        # has an empty value - "Telephone:". So the search / replace cannot
        # include a trailing space.
        voice = lines[0].replace('Telephone:', '').replace(
            'Téléphone :', '').strip()
        fax = ''
        if len(lines) > 1:
            fax = lines[1].replace('Fax:', '').replace(
                'Télécopieur :', '').strip()
        return voice, fax

    for row in rows:
        name = row.xpath(
            './/div[@class="ce-mip-mp-name"][1]')[0].text_content()
        constituency = row.xpath(
            './/div[@class="ce-mip-mp-constituency"][1]')[0].text_content()
        constituency = constituency.replace('–', '—')  # en dash -> em dash
        if constituency == 'Mont-Royal':
            constituency = 'Mount Royal'

        province = row.xpath(
            './/div[@class="ce-mip-mp-province"][1]')[0].text_content()
        party = row.xpath(
            './/div[@class="ce-mip-mp-party"][1]')[0].text_content()
        url = row.xpath('.//a[@class="ce-mip-mp-tile"]/@href')[0]

        if province == 'Québec':
            url = url.replace('/en/', '/fr/')

        mp_page = self.lxmlize(url)
        email = self.get_email(mp_page, '//*[@id="contact"]/div/p/a',
                               error=False)
        # The [0] raises IndexError if the profile container has no <img>.
        photo = mp_page.xpath(
            './/div[@class="ce-mip-mp-profile-container"]//img/@src')[0]

        m = Person(primary_org='lower', name=name, district=constituency,
                   role='MP', party=party)
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)
        m.gender = gender
        # @see https://www.ourcommons.ca/Members/en/ziad-aboultaif(89156)
        if email:
            m.add_contact('email', email)

        if photo:
            # Determine whether the photo is actually a generic silhouette
            photo_response = self.get(photo)
            if (photo_response.status_code == 200
                    and hashlib.sha1(photo_response.content).hexdigest()
                    not in IMAGE_PLACEHOLDER_SHA1):
                m.image = photo

        # I don't think the new parliament website has personal websites anymore
        personal_url = mp_page.xpath(
            './/a[contains(@title, "Personal Web Site")]/@href')
        if personal_url:
            m.add_link(personal_url[0])

        preferred_languages = mp_page.xpath(
            './/dt[contains(., "Preferred Language")]/following-sibling::dd/text()'
        )
        if preferred_languages:
            m.extras['preferred_languages'] = [
                language.replace('/', '').strip()
                for language in preferred_languages
            ]

        if province == 'Québec':
            m.add_contact('address',
                          'Chambre des communes\nOttawa ON K1A 0A6',
                          'legislature')
        else:
            m.add_contact('address', 'House of Commons\nOttawa ON K1A 0A6',
                          'legislature')

        # Hill Office contacts (English or French heading text).
        hill_office_els = mp_page.xpath(
            './/h4[contains(., "Hill Office")]/../p[contains(., "Telephone")]'
            '|.//h4[contains(., "Hill Office")]/../p[contains(., "Téléphone :")]'
        )
        if hill_office_els:
            voice, fax = split_phone_and_fax(hill_office_els[0])
            if voice:
                m.add_contact('voice', voice, 'legislature')
            if fax:
                m.add_contact('fax', fax, 'legislature')

        # Constituency Office contacts
        # Some people have more than one, e.g. https://www.ourcommons.ca/Members/en/ben-lobb(35600)#contact
        constituency_office_els = mp_page.xpath(
            './/div[@class="ce-mip-contact-constituency-office-container"]/div'
        )
        for i, constituency_office_el in enumerate(constituency_office_els):
            # First office is 'constituency', later ones 'constituency (2)', ...
            note = 'constituency'
            if i:
                note += ' ({})'.format(i + 1)

            address_el = constituency_office_el.xpath('./p[1]')[0]
            address_lines = address_el.text_content().strip().splitlines()
            m.add_contact(
                'address',
                '\n'.join(line.strip() for line in address_lines),
                note)

            office_phone_els = constituency_office_el.xpath(
                './p[contains(., "Telephone")]|./p[contains(., "Téléphone")]'
            )
            if office_phone_els:
                # voice and fax are recomputed for every office, so a
                # missing fax line never picks up an earlier office's number.
                voice, fax = split_phone_and_fax(office_phone_els[0])
                if voice:
                    m.add_contact('voice', voice, note)
                if fax:
                    m.add_contact('fax', fax, note)

        yield m