def parse_parl(self, data):
    """Merge one parliament-site member record into this person, then scrape their page.

    ``data`` is a mapping that must provide ``given_names``, ``family_name``,
    ``id``, ``party_name`` and ``party_id``; ``email`` may be missing or empty.

    If ``self.data`` is empty it is seeded from ``data`` and given a fresh
    person id; otherwise the existing name must equal the one built from
    ``data``.  A ``za.gov.parliament/person`` identifier is inserted first in
    ``identifiers``, the e-mail address and party are cross-checked against
    what is already recorded (or added if absent), the member page is fetched
    into ``self.text`` and the ``parse_*`` page scrapers are run.

    Raises AssertionError when the name, e-mail, party identifier or party
    membership already on record disagrees with ``data``.
    """
    name = '%(given_names)s %(family_name)s' % data

    if not self.data:
        # First sighting of this person: start from the parliament row, minus
        # the fields that are modelled separately below.
        self.data.update(data)
        del self.data['party_id']
        del self.data['party_name']
        # The e-mail is read with .get() further down, so tolerate a row
        # without that key here as well instead of raising KeyError.
        self.data.pop('email', None)
        self.data['id'] = idFactory.new('person')
        self.data['name'] = name
    assert self.data['name'] == name

    self.data.setdefault('identifiers', []).insert(0, {
        'scheme': 'za.gov.parliament/person',
        'identifier': '%(id)s' % data,
    })

    email = data.get('email')
    if email:
        email_detail = {'type': 'email', 'value': email}
        if 'contact_details' in self.data:
            known_emails = [
                x['value'] for x in self.data['contact_details']
                if x['type'] == 'email'
            ]
            if known_emails:
                assert email == known_emails[0]
            else:
                # Contact details exist but none is an e-mail address: record
                # it rather than failing with IndexError.
                self.data['contact_details'].append(email_detail)
        else:
            self.data['contact_details'] = [email_detail]

    party = self.organizations[data['party_name']]
    if 'identifiers' in party:
        assert party['identifiers'][0]['identifier'] == data['party_id']
    else:
        party['identifiers'] = [
            {'scheme': 'za.gov.parliament/party', 'identifier': data['party_id']},
        ]

    existing_party = [
        x for x in self.data.get('memberships', [])
        if 'party' in x['organization_id']
    ]
    if existing_party:
        assert party['id'] == existing_party[0]['organization_id'], party['id']
    else:
        add_membership(self.data, {
            'person_id': self.data['id'],
            'organization_id': party['id'],
        })

    # Fetch the member's page; the scrapers below all read self.text.
    self.text = requests.get('http://www.parliament.gov.za/live/content.php?Item_ID=184&MemberID=%(id)s' % data).text
    self.parse_honorific()
    self.parse_table()
    self.parse_photo()
    self.parse_committees()
def parse(data):
    """Add committees and committee memberships from the CSV files under ``data_path``.

    ``data`` must hold ``'organizations'`` (keyed by name) and ``'persons'``
    mappings.  Every row of ``committees.csv`` whose ``Name`` is not yet an
    organization is added with a new ``committee_pmg`` id.  Every row of
    ``committee-members.csv`` is matched to exactly one person by family name
    (and initials when the family name is ambiguous), its ``Party`` column is
    checked against the person's party membership, and a committee membership
    is passed to ``add_membership``.

    Returns the same ``data`` object, mutated in place.
    Raises AssertionError when a row does not match exactly one person or
    when the party in the row differs from the recorded one.
    """
    orgs_by_id = dict((org['id'], org) for org in data['organizations'].values())

    # Known misspellings of family names in committee-members.csv.
    # TODO: Use the person's other_names field, and get these misspellings in there.
    family_name_fixes = {
        'Khorai': 'Khoarai',
        'Hoosan': 'Hoosen',
        'Jeffrey': 'Jeffery',
        'Hill-Lews': 'Hill-Lewis',
    }

    # TODO: Perhaps check old/new committees, then stop using parl.py
    # committees. Or just assume these new ones are accurate.
    # The files are opened with "with" so the handles are closed promptly.
    with open(data_path + 'committees.csv') as committees_file:
        for row in csv.DictReader(committees_file):
            if row['Name'] not in data['organizations']:
                data['organizations'][row['Name']] = {
                    'id': idFactory.new('committee_pmg'),
                    'name': row['Name'],
                    'slug': row['Name'].lower().replace(' ', '-'),
                    'classification': row['Type'],
                }

    with open(data_path + 'committee-members.csv') as members_file:
        for row in csv.DictReader(members_file):
            # Move a title that ended up before the comma ("X Mr, AB") behind it.
            row['Name'] = re.sub(r'^([^,]*) Mr, (.*)$', r'\1, Mr \2', row['Name'])
            family_name, initials = row['Name'].split(',')
            # Drop the leading whitespace and title, leaving only the initials.
            initials = re.sub(r'^\s*(Mr|Ms|Dr|Nkosi|Prof|Adv|Prince)\s+', '', initials)

            family_name = family_name_fixes.get(family_name, family_name)
            if family_name == 'Koornhof' and initials == 'NC':
                initials = 'NJJVR'

            persons = list(data['persons'].values())
            matches = [x for x in persons if asciify(x['family_name']) == family_name]
            if len(matches) > 1:
                # Ambiguous family name: narrow down by exact initials first ...
                matches = [
                    x for x in persons
                    if x['family_name'] == family_name
                    and initialise(x['given_names']) == initials
                ]
                # NOTE(review): the original indentation was lost; the prefix
                # fallback is assumed to apply only after the exact-initials
                # match came up empty.
                if not matches:
                    # ... then by the initials being a prefix of the full ones.
                    matches = [
                        x for x in persons
                        if x['family_name'] == family_name
                        and initialise(x['given_names'])[0:len(initials)] == initials
                    ]

            # With the current data, we now always have one result
            assert len(matches) == 1
            person = matches[0]

            party = [
                x for x in person['memberships'] if 'party' in x['organization_id']
            ][0]['organization_id']
            assert row['Party'] == orgs_by_id[party]['name'], row['Party'] + orgs_by_id[party]['name']

            mship = {'organization_id': data['organizations'][row['Committee']]['id']}
            if row['IsAlternative?'] == 'True':
                mship['role'] = 'Alternate Member'
            # Checked second so that Chairperson wins when both flags are set.
            if row['IsChairperson?'] == 'True':
                mship['role'] = 'Chairperson'
            add_membership(person, mship)

    return data
def parse_committees(self):
    """Scrape the "Committees represented on" table of the member page.

    Reads ``self.text``; every committee link found there is registered in
    ``self.organizations`` (keyed by committee name) unless already present,
    and a membership of that committee is passed to ``add_membership`` for
    ``self.data``.

    Raises AttributeError if the page has no committees section (the search
    returns None), and AssertionError if a committee is already known under
    a different id.
    """
    # re.S is passed as a flag argument: Python 3.11+ rejects an inline
    # "(?s)" that is not at the very start of the pattern.
    m = re.search(
        r'<td[^>]*><b[^>]*>Committees represented on: *</b></td>.*?<table[^>]*>(.*?)</table>',
        self.text, re.S)
    committees = dict(re.findall(
        r'<a href="content.php\?Item_ID=\d+&CommitteeID=(\d+)">(.*?)</a>',
        m.group(1)))

    # list() keeps reversed() working on interpreters where dict.items() is a
    # non-reversible view.
    for committee_id, name in reversed(list(committees.items())):
        org_id = 'org.mysociety.za/committee/' + committee_id
        if name in self.organizations:
            assert self.organizations[name]['id'] == org_id
        else:
            self.organizations[name] = {
                'id': org_id,
                'name': name,
                'identifiers': [
                    {'scheme': 'za.gov.parliament/committee', 'identifier': committee_id},
                ],
                'slug': name.lower().replace(' ', '-'),
                'classification': 'committee',
            }
        add_membership(self.data, {
            'person_id': self.data['id'],
            'organization_id': self.organizations[name]['id'],
        })
def parse(data):
    """Apply slugs and hand-maintained start/end dates to house memberships.

    Every person in ``data['persons']`` gets a ``slug`` derived from the
    name.  National Assembly "Member" and NCOP "Delegate" memberships get a
    ``start_date`` (a default, unless the manual tables below say otherwise)
    and, where listed, an ``end_date`` / ``end_reason``.  People in the NCOP
    table who have no NCOP membership get one added, and those with no person
    entry at all are created.

    Returns the same ``data`` object, mutated in place.
    Raises AssertionError when the data does not contain exactly three people
    without a house membership or a membership already carries a date that is
    about to be set, and Exception when someone in the National Assembly
    table has no National Assembly "Member" membership.
    """
    for person in data['persons'].values():
        person['slug'] = person['name'].lower().replace(' ', '-')

    # There are three non-Assembly/NCOP people in the executive
    no_house = sum(
        1 for p in data['persons'].values()
        if not any('house' in y['organization_id'] for y in p['memberships'])
    )
    assert no_house == 3

    na_manual = {
        'Cassel Charlie Mathale': {'start_date': '2013-07-15'},
        'Wayne Maxim Thring': {'start_date': '2013-06-21'},
        'Masenyani Richard Baloyi': {'end_date': '2013-07-10'},
        'Letlapa Moroatshoge Mphahlele': {'end_date': '2013-07-11', 'end_reason': 'Ceased to be a member under section 47(3)(c) of the Constitution (changed party)'},
        # 'Mpethi': { 'start_date': ? },
        'Ntopile Marcel Kganyago': {'end_date': '2013-07-17', 'end_reason': 'Died'},
        'Nqabayomzi Lawrence Kwankwa': {'start_date': '2013-08-06'},
        'Loretta Jacobus': {'end_date': '2013-08-01'},
    }

    ncop_manual = {
        'Rory Dean MacPherson': {'party': 'DA', 'end_date': '2009-05-29', 'province': 'KwaZulu-Natal'},
        'Robert Alfred Lees': {'start_date': '2009-06-11'},
        'Sheery Su-Huei Cheng': {'party': 'DA', 'end_date': '2010-09-30', 'province': 'Gauteng'},
        'Beverley Lynette Abrahams': {'start_date': '2010-10-01'},
        'Timothy Duncan Harris': {'party': 'DA', 'end_date': '2010-09-09', 'province': 'Western Cape'},
        'Theodorus Barnardus Beyleveldt': {'party': 'DA', 'start_date': '2010-10-12', 'end_date': '2011-07-10', 'end_reason': 'Died', 'province': 'Western Cape'},
        'Denis Joseph': {'start_date': '2011-10-20'},
        'Armiston Watson': {'party': 'DA', 'end_date': '2011-11-07', 'province': 'Mpumalanga'},
        'Velly Makasana Manzini': {'start_date': '2011-11-08'},
        'Tlhalefi Andries Mashamaite': {'party': 'ANC', 'end_date': '2012-05-08', 'province': 'Limpopo'},
        'Thabo Lucas Makunyane': {'start_date': '2012-05-22'},
        'Zukisa Cheryl Faku': {'start_date': '2013-04-25'},
        'Mokoane Collen Maine': {'end_date': '2013-08-01'},  # XXX
    }

    for person in data['persons'].values():
        name = person['name']
        mships = person['memberships']

        ncop_mships = [
            x for x in mships
            if 'ncop' in x['organization_id'] and x['role'] == 'Delegate'
        ]
        if ncop_mships:
            # Present, and has NCOP membership entry. Set a start and possibly end date.
            mship = ncop_mships[0]
            assert 'start_date' not in mship
            n = ncop_manual.pop(name, {})
            mship['start_date'] = n.get('start_date', '2009-05-07')
            if 'end_date' in n and 'end_date' not in mship:
                mship['end_date'] = n['end_date']
        elif name in ncop_manual:
            # Present, but has no NCOP membership entry
            n = ncop_manual.pop(name)
            add_membership(person, {
                'organization_id': 'org.mysociety.za/house/ncop',
                'label': 'Delegate for %s' % n['province'],
                'role': 'Delegate',
                'area': {
                    'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[n['province']],
                    'name': n['province'],
                },
                'start_date': n.get('start_date', '2009-05-07'),
                'end_date': n['end_date'],
                'end_reason': n.get('end_reason', 'Resigned'),
            })

        na_mships = [
            x for x in mships
            if 'house/na' in x['organization_id'] and x['role'] == 'Member'
        ]
        if na_mships:
            mship = na_mships[0]
            n = na_manual.pop(name, {})
            if 'start_date' not in mship:
                # pop(), not get(): whatever is left in n afterwards is
                # treated as end-of-membership information.
                mship['start_date'] = n.pop('start_date', '2009-05-06')
            if n:
                assert 'end_date' not in mship
                mship['end_date'] = n['end_date']
                mship['end_reason'] = n.get('end_reason', 'Resigned')
        elif name in na_manual:
            raise Exception(
                '%s has manual National Assembly dates but no National '
                'Assembly Member membership' % name)

    # The ones left have no person entry at all.
    for name, d in ncop_manual.items():
        person_id = idFactory.new('person')
        given_names, family_name = name.rsplit(None, 1)
        person = {
            'id': person_id,
            'name': name,
            'given_names': given_names,
            'family_name': family_name,
            'slug': name.lower().replace(' ', '-'),
        }
        add_membership(person, {'organization_id': data['organizations'][d['party']]['id']})
        add_membership(person, {
            'organization_id': 'org.mysociety.za/house/ncop',
            'label': 'Delegate for %s' % d['province'],
            'role': 'Delegate',
            'area': {
                'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[d['province']],
                'name': d['province'],
            },
            'start_date': d.get('start_date', '2009-05-07'),
            'end_date': d['end_date'],
            'end_reason': d.get('end_reason', 'Resigned'),
        })
        data['persons'][name] = person

    return data
def parse(data):
    """Apply slugs and hand-maintained start/end dates to house memberships.

    Every person in ``data['persons']`` gets a ``slug`` derived from the
    name.  National Assembly "Member" and NCOP "Delegate" memberships get a
    ``start_date`` (a default, unless the manual tables below say otherwise)
    and, where listed, an ``end_date`` / ``end_reason``.  People in the NCOP
    table who have no NCOP membership get one added, and those with no person
    entry at all are created.

    Returns the same ``data`` object, mutated in place.
    Raises AssertionError when the data does not contain exactly three people
    without a house membership or a membership already carries a date that is
    about to be set, and Exception when someone in the National Assembly
    table has no National Assembly "Member" membership.
    """
    for person in data['persons'].values():
        person['slug'] = person['name'].lower().replace(' ', '-')

    # There are three non-Assembly/NCOP people in the executive
    no_house = sum(
        1 for p in data['persons'].values()
        if not any('house' in y['organization_id'] for y in p['memberships'])
    )
    assert no_house == 3

    na_manual = {
        'Cassel Charlie Mathale': {'start_date': '2013-07-15'},
        'Wayne Maxim Thring': {'start_date': '2013-06-21'},
        'Masenyani Richard Baloyi': {'end_date': '2013-07-10'},
        'Letlapa Moroatshoge Mphahlele': {'end_date': '2013-07-11', 'end_reason': 'Ceased to be a member under section 47(3)(c) of the Constitution (changed party)'},
        # 'Mpethi': { 'start_date': ? },
        'Ntopile Marcel Kganyago': {'end_date': '2013-07-17', 'end_reason': 'Died'},
        'Nqabayomzi Lawrence Kwankwa': {'start_date': '2013-08-06'},
        'Loretta Jacobus': {'end_date': '2013-08-01'},
    }

    ncop_manual = {
        'Rory Dean MacPherson': {'party': 'DA', 'end_date': '2009-05-29', 'province': 'KwaZulu-Natal'},
        'Robert Alfred Lees': {'start_date': '2009-06-11'},
        'Sheery Su-Huei Cheng': {'party': 'DA', 'end_date': '2010-09-30', 'province': 'Gauteng'},
        'Beverley Lynette Abrahams': {'start_date': '2010-10-01'},
        'Timothy Duncan Harris': {'party': 'DA', 'end_date': '2010-09-09', 'province': 'Western Cape'},
        'Theodorus Barnardus Beyleveldt': {'party': 'DA', 'start_date': '2010-10-12', 'end_date': '2011-07-10', 'end_reason': 'Died', 'province': 'Western Cape'},
        'Denis Joseph': {'start_date': '2011-10-20'},
        'Armiston Watson': {'party': 'DA', 'end_date': '2011-11-07', 'province': 'Mpumalanga'},
        'Velly Makasana Manzini': {'start_date': '2011-11-08'},
        'Tlhalefi Andries Mashamaite': {'party': 'ANC', 'end_date': '2012-05-08', 'province': 'Limpopo'},
        'Thabo Lucas Makunyane': {'start_date': '2012-05-22'},
        'Zukisa Cheryl Faku': {'start_date': '2013-04-25'},
        'Mokoane Collen Maine': {'end_date': '2013-08-01'},  # XXX
    }

    for person in data['persons'].values():
        name = person['name']
        mships = person['memberships']

        ncop_mships = [
            x for x in mships
            if 'ncop' in x['organization_id'] and x['role'] == 'Delegate'
        ]
        if ncop_mships:
            # Present, and has NCOP membership entry. Set a start and possibly end date.
            mship = ncop_mships[0]
            assert 'start_date' not in mship
            n = ncop_manual.pop(name, {})
            mship['start_date'] = n.get('start_date', '2009-05-07')
            if 'end_date' in n and 'end_date' not in mship:
                mship['end_date'] = n['end_date']
        elif name in ncop_manual:
            # Present, but has no NCOP membership entry
            n = ncop_manual.pop(name)
            add_membership(person, {
                'organization_id': 'org.mysociety.za/house/ncop',
                'label': 'Delegate for %s' % n['province'],
                'role': 'Delegate',
                'area': {
                    'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[n['province']],
                    'name': n['province'],
                },
                'start_date': n.get('start_date', '2009-05-07'),
                'end_date': n['end_date'],
                'end_reason': n.get('end_reason', 'Resigned'),
            })

        na_mships = [
            x for x in mships
            if 'house/na' in x['organization_id'] and x['role'] == 'Member'
        ]
        if na_mships:
            mship = na_mships[0]
            n = na_manual.pop(name, {})
            if 'start_date' not in mship:
                # pop(), not get(): whatever is left in n afterwards is
                # treated as end-of-membership information.
                mship['start_date'] = n.pop('start_date', '2009-05-06')
            if n:
                assert 'end_date' not in mship
                mship['end_date'] = n['end_date']
                mship['end_reason'] = n.get('end_reason', 'Resigned')
        elif name in na_manual:
            raise Exception(
                '%s has manual National Assembly dates but no National '
                'Assembly Member membership' % name)

    # The ones left have no person entry at all.
    for name, d in ncop_manual.items():
        person_id = idFactory.new('person')
        given_names, family_name = name.rsplit(None, 1)
        person = {
            'id': person_id,
            'name': name,
            'given_names': given_names,
            'family_name': family_name,
            'slug': name.lower().replace(' ', '-'),
        }
        add_membership(person, {'organization_id': data['organizations'][d['party']]['id']})
        add_membership(person, {
            'organization_id': 'org.mysociety.za/house/ncop',
            'label': 'Delegate for %s' % d['province'],
            'role': 'Delegate',
            'area': {
                'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[d['province']],
                'name': d['province'],
            },
            'start_date': d.get('start_date', '2009-05-07'),
            'end_date': d['end_date'],
            'end_reason': d.get('end_reason', 'Resigned'),
        })
        data['persons'][name] = person

    return data
def parse():
    """Build the initial persons/organizations data from the MyReps exports.

    Reads four files under ``data_path``:

    * ``myreps_na_executive_export.csv`` - one row per position; creates the
      people and passes each position to ``add_membership``.
    * ``myreps-na.xml`` - attaches MyReps identifiers to current National
      Assembly members.
    * ``myreps-national-assembly.html`` - attaches MyReps person ids to the
      past members listed in the ``id="past"`` div.
    * ``myreps-ncop.xml`` - creates the NCOP delegates with their identifiers,
      NCOP membership and party membership.

    Fills the module-level ``PEOPLE`` and ``ORGANIZATIONS`` mappings and
    returns ``{'persons': PEOPLE, 'organizations': ORGANIZATIONS}``, with
    each ``PEOPLE`` entry flattened to the person dict carrying its
    ``memberships``.

    Raises AssertionError when two CSV rows for the same name disagree on the
    person details, and KeyError when a name in the National Assembly files
    is not already in ``PEOPLE``.
    """
    # Files are opened with "with" so the handles are closed promptly.
    with open(data_path + 'myreps_na_executive_export.csv') as export_file:
        for row in FixingDictReader(export_file):
            person_bits = dict(
                (col_map(k), v) for k, v in row.items()
                if k in ('first_name', 'last_name', 'initials_alt',
                         'other_names', 'title', 'email') and v)
            position_bits = dict(
                (col_map(k), v) for k, v in row.items()
                if k in ('start_date', 'end_date', 'end_reason',
                         'organisation', 'position', 'region')
                and v and v != 'Member' and v != 'National')

            # An end reason only makes sense with an end date, and '0' is
            # dropped as well.  Empty columns are filtered out above, so the
            # key may be absent: tolerate that instead of raising KeyError.
            if 'end_date' not in position_bits:
                position_bits.pop('end_reason', None)
            elif position_bits.get('end_reason') == '0':
                del position_bits['end_reason']

            if person_bits['given_names'] == 'Tlhalefi Andries':
                continue  # Comes in elsewhere

            # Manual fixes of file
            fix_person_bits(person_bits)
            fix_end_reason(position_bits, person_bits)

            name = '%(given_names)s %(family_name)s' % person_bits
            person_bits['name'] = name
            if person_bits.get('email'):
                person_bits['contact_details'] = [{
                    'type': 'email',
                    'value': person_bits.pop('email'),
                }]
            if 'other_names' in person_bits:
                person_bits['other_names'] = [{'name': person_bits['other_names']}]

            organisation = position_bits.pop('organisation')
            # Organisations not known yet are registered as parties.
            if organisation not in ORGANIZATIONS:
                ORGANIZATIONS[organisation] = {
                    'id': 'org.mysociety.za/party/' + organisation.lower(),
                    'name': organisation,
                    'slug': organisation.lower(),
                    'classification': 'party',
                }
            position_bits['organization_id'] = ORGANIZATIONS[organisation]['id']

            if (position_bits['organization_id'] == 'org.mysociety.za/house/national-assembly'
                    and 'role' not in position_bits):
                position_bits['label'] = position_bits['role'] = 'Member'
            elif position_bits['organization_id'] == 'org.mysociety.za/house/ncop':
                position_bits['label'] = position_bits['role'] = 'Delegate'

            if 'end_reason' in position_bits:
                position_bits['end_reason'] = REASONS[position_bits['end_reason']]

            if position_bits.get('region'):
                r = position_bits.pop('region')
                position_bits['area'] = {
                    'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[r],
                    'name': r,
                }
                position_bits['label'] += ' for ' + r

            if name in PEOPLE:
                # Same person on another row: details must agree exactly.
                person_bits['id'] = PEOPLE[name]['person']['id']
                assert PEOPLE[name]['person'] == person_bits
            else:
                person_bits['id'] = idFactory.new('person')
                PEOPLE[name] = {'id': person_bits['id'], 'person': person_bits}
            add_membership(PEOPLE[name], position_bits)

    # National Assembly MyReps site data
    # To fetch myreps ID and PERSON_ID
    with open(data_path + 'myreps-na.xml') as na_file:
        na = na_file.read()
    cols_xml = [
        'id', 'person_id', 'person_first_name', 'person_last_name',
        'person_paries',
    ]
    for person in ET.fromstring(na).iter('Members'):
        row = dict(zip(cols_xml, [person.find(x).text for x in cols_xml]))
        if row['person_first_name'] == 'Nomaindiya Cathleen':
            row['person_first_name'] = 'NomaIndiya Cathleen'
        if row['person_first_name'] == 'Alpheus' and row['person_last_name'] == 'Mokabhe':
            row.update(person_first_name='Alpheus Mokabhe', person_last_name='Maziya')
        if row['person_first_name'] == 'Ximbi':
            row.update(person_first_name='Dumsani Livingstone', person_last_name='Ximbi')
        name = '%(person_first_name)s %(person_last_name)s' % row
        name = fix_bad_encoding(name.encode('utf-8'))
        identifiers = [
            {'identifier': row['person_id'], 'scheme': 'myreps_person_id'},
        ]
        if row['id']:
            identifiers.append({'identifier': row['id'], 'scheme': 'myreps_id'})
        PEOPLE[name]['person']['identifiers'] = identifiers

    with open(data_path + 'myreps-national-assembly.html') as na_prev_file:
        na_prev = na_prev_file.read()
    # re.S is passed as a flag argument: Python 3.11+ rejects an inline
    # "(?s)" that is not at the very start of the pattern.
    na_prev = re.search(r'<div[^>]*id="past"[^>]*>.*?</div>', na_prev, re.S).group(0)
    for person in re.findall(
            '<li><a href="/people/view/(.*?)">([^<]*) ([^<]*?)</a> until .*?</li>',
            na_prev):
        row = dict(zip(cols_xml, ['', person[0], person[1], person[2], '']))
        # The pattern splits on the last space, which breaks multi-word surnames.
        if row['person_first_name'] == 'Patricia de':
            row.update(person_first_name='Patricia', person_last_name='de Lille')
        if row['person_first_name'] == 'D van der':
            row.update(person_first_name='D', person_last_name='van der Walt')
        name = '%(person_first_name)s %(person_last_name)s' % row
        PEOPLE[name]['person']['identifiers'] = [
            {'identifier': row['person_id'], 'scheme': 'myreps_person_id'},
        ]

    # NCOP MyReps site data
    with open(data_path + 'myreps-ncop.xml') as ncop_file:
        ncop = ncop_file.read()
    for person in ET.fromstring(ncop).iter('Members'):
        row = dict(zip(cols_xml, [person.find(x).text for x in cols_xml]))
        # Change couple of names to match parliament data
        if row['person_first_name'] == 'Arthur':
            row['person_first_name'] = 'Robert Alfred'
        elif row['person_first_name'] == 'Buoang Lemias':
            row['person_first_name'] = 'Budang Lemias'
        name = '%(person_first_name)s %(person_last_name)s' % row
        person_id = idFactory.new('person')
        PEOPLE[name] = {
            'id': person_id,
            'person': {
                'id': person_id,
                'given_names': row['person_first_name'],
                'family_name': row['person_last_name'],
                'name': name,
                'identifiers': [
                    {'identifier': row['id'], 'scheme': 'myreps_id'},
                    {'identifier': row['person_id'], 'scheme': 'myreps_person_id'},
                ],
            },
        }
        add_membership(PEOPLE[name], {
            'organization_id': 'org.mysociety.za/house/ncop',
            'label': 'Delegate',
            'role': 'Delegate',
        })
        if row['id'] == '7852':
            # Special case of one person resigned since data
            PEOPLE[name]['memberships'][0].update(
                end_date='2013-03-27',
                end_reason='Resigned',
                label='Delegate for Eastern Cape',
                area={
                    'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES['Eastern Cape'],
                    'name': 'Eastern Cape',
                })
        if row['person_paries']:
            add_membership(
                PEOPLE[name],
                {'organization_id': ORGANIZATIONS[row['person_paries']]['id']})

    # Flatten each {'id', 'person', 'memberships'} wrapper into the person
    # dict itself; iterate over a snapshot of the keys while rebinding values.
    for name in list(PEOPLE):
        entry = PEOPLE[name]
        entry['person'].update(memberships=entry['memberships'])
        PEOPLE[name] = entry['person']

    return {
        'persons': PEOPLE,
        'organizations': ORGANIZATIONS,
    }
def parse(data):
    """Add committees and committee memberships from the CSV files under ``data_path``.

    ``data`` must hold ``'organizations'`` (keyed by name) and ``'persons'``
    mappings.  Every row of ``committees.csv`` whose ``Name`` is not yet an
    organization is added with a new ``committee_pmg`` id.  Every row of
    ``committee-members.csv`` is matched to exactly one person by family name
    (and initials when the family name is ambiguous), its ``Party`` column is
    checked against the person's party membership, and a committee membership
    is passed to ``add_membership``.

    Returns the same ``data`` object, mutated in place.
    Raises AssertionError when a row does not match exactly one person or
    when the party in the row differs from the recorded one.
    """
    orgs_by_id = dict((org['id'], org) for org in data['organizations'].values())

    # Known misspellings of family names in committee-members.csv.
    # TODO: Use the person's other_names field, and get these misspellings in there.
    family_name_fixes = {
        'Khorai': 'Khoarai',
        'Hoosan': 'Hoosen',
        'Jeffrey': 'Jeffery',
        'Hill-Lews': 'Hill-Lewis',
    }

    # TODO: Perhaps check old/new committees, then stop using parl.py
    # committees. Or just assume these new ones are accurate.
    # The files are opened with "with" so the handles are closed promptly.
    with open(data_path + 'committees.csv') as committees_file:
        for row in csv.DictReader(committees_file):
            if row['Name'] not in data['organizations']:
                data['organizations'][row['Name']] = {
                    'id': idFactory.new('committee_pmg'),
                    'name': row['Name'],
                    'slug': row['Name'].lower().replace(' ', '-'),
                    'classification': row['Type'],
                }

    with open(data_path + 'committee-members.csv') as members_file:
        for row in csv.DictReader(members_file):
            # Move a title that ended up before the comma ("X Mr, AB") behind it.
            row['Name'] = re.sub(r'^([^,]*) Mr, (.*)$', r'\1, Mr \2', row['Name'])
            family_name, initials = row['Name'].split(',')
            # Drop the leading whitespace and title, leaving only the initials.
            initials = re.sub(r'^\s*(Mr|Ms|Dr|Nkosi|Prof|Adv|Prince)\s+', '', initials)

            family_name = family_name_fixes.get(family_name, family_name)
            if family_name == 'Koornhof' and initials == 'NC':
                initials = 'NJJVR'

            persons = list(data['persons'].values())
            matches = [x for x in persons if asciify(x['family_name']) == family_name]
            if len(matches) > 1:
                # Ambiguous family name: narrow down by exact initials first ...
                matches = [
                    x for x in persons
                    if x['family_name'] == family_name
                    and initialise(x['given_names']) == initials
                ]
                # NOTE(review): the original indentation was lost; the prefix
                # fallback is assumed to apply only after the exact-initials
                # match came up empty.
                if not matches:
                    # ... then by the initials being a prefix of the full ones.
                    matches = [
                        x for x in persons
                        if x['family_name'] == family_name
                        and initialise(x['given_names'])[0:len(initials)] == initials
                    ]

            # With the current data, we now always have one result
            assert len(matches) == 1
            person = matches[0]

            party = [
                x for x in person['memberships'] if 'party' in x['organization_id']
            ][0]['organization_id']
            assert row['Party'] == orgs_by_id[party]['name'], row['Party'] + orgs_by_id[party]['name']

            mship = {'organization_id': data['organizations'][row['Committee']]['id']}
            if row['IsAlternative?'] == 'True':
                mship['role'] = 'Alternate Member'
            # Checked second so that Chairperson wins when both flags are set.
            if row['IsChairperson?'] == 'True':
                mship['role'] = 'Chairperson'
            add_membership(person, mship)

    return data
def parse():
    """Build the initial persons/organizations data from the MyReps exports.

    Reads four files under ``data_path``:

    * ``myreps_na_executive_export.csv`` - one row per position; creates the
      people and passes each position to ``add_membership``.
    * ``myreps-na.xml`` - attaches MyReps identifiers to current National
      Assembly members.
    * ``myreps-national-assembly.html`` - attaches MyReps person ids to the
      past members listed in the ``id="past"`` div.
    * ``myreps-ncop.xml`` - creates the NCOP delegates with their identifiers,
      NCOP membership and party membership.

    Fills the module-level ``PEOPLE`` and ``ORGANIZATIONS`` mappings and
    returns ``{"persons": PEOPLE, "organizations": ORGANIZATIONS}``, with
    each ``PEOPLE`` entry flattened to the person dict carrying its
    ``memberships``.

    Raises AssertionError when two CSV rows for the same name disagree on the
    person details, and KeyError when a name in the National Assembly files
    is not already in ``PEOPLE``.
    """
    # Files are opened with "with" so the handles are closed promptly.
    with open(data_path + "myreps_na_executive_export.csv") as export_file:
        for row in FixingDictReader(export_file):
            person_bits = dict(
                (col_map(k), v)
                for k, v in row.items()
                if k in ("first_name", "last_name", "initials_alt", "other_names", "title", "email") and v
            )
            position_bits = dict(
                (col_map(k), v)
                for k, v in row.items()
                if k in ("start_date", "end_date", "end_reason", "organisation", "position", "region")
                and v
                and v != "Member"
                and v != "National"
            )

            # An end reason only makes sense with an end date, and "0" is
            # dropped as well.  Empty columns are filtered out above, so the
            # key may be absent: tolerate that instead of raising KeyError.
            if "end_date" not in position_bits:
                position_bits.pop("end_reason", None)
            elif position_bits.get("end_reason") == "0":
                del position_bits["end_reason"]

            if person_bits["given_names"] == "Tlhalefi Andries":
                continue  # Comes in elsewhere

            # Manual fixes of file
            fix_person_bits(person_bits)
            fix_end_reason(position_bits, person_bits)

            name = "%(given_names)s %(family_name)s" % person_bits
            person_bits["name"] = name
            if person_bits.get("email"):
                person_bits["contact_details"] = [{"type": "email", "value": person_bits.pop("email")}]
            if "other_names" in person_bits:
                person_bits["other_names"] = [{"name": person_bits["other_names"]}]

            organisation = position_bits.pop("organisation")
            # Organisations not known yet are registered as parties.
            if organisation not in ORGANIZATIONS:
                ORGANIZATIONS[organisation] = {
                    "id": "org.mysociety.za/party/" + organisation.lower(),
                    "name": organisation,
                    "slug": organisation.lower(),
                    "classification": "party",
                }
            position_bits["organization_id"] = ORGANIZATIONS[organisation]["id"]

            if (
                position_bits["organization_id"] == "org.mysociety.za/house/national-assembly"
                and "role" not in position_bits
            ):
                position_bits["label"] = position_bits["role"] = "Member"
            elif position_bits["organization_id"] == "org.mysociety.za/house/ncop":
                position_bits["label"] = position_bits["role"] = "Delegate"

            if "end_reason" in position_bits:
                position_bits["end_reason"] = REASONS[position_bits["end_reason"]]

            if position_bits.get("region"):
                r = position_bits.pop("region")
                position_bits["area"] = {"id": "org.mysociety.za/mapit/code/p/" + PROVINCES[r], "name": r}
                position_bits["label"] += " for " + r

            if name in PEOPLE:
                # Same person on another row: details must agree exactly.
                person_bits["id"] = PEOPLE[name]["person"]["id"]
                assert PEOPLE[name]["person"] == person_bits
            else:
                person_bits["id"] = idFactory.new("person")
                PEOPLE[name] = {"id": person_bits["id"], "person": person_bits}
            add_membership(PEOPLE[name], position_bits)

    # National Assembly MyReps site data
    # To fetch myreps ID and PERSON_ID
    with open(data_path + "myreps-na.xml") as na_file:
        na = na_file.read()
    cols_xml = ["id", "person_id", "person_first_name", "person_last_name", "person_paries"]
    for person in ET.fromstring(na).iter("Members"):
        row = dict(zip(cols_xml, [person.find(x).text for x in cols_xml]))
        if row["person_first_name"] == "Nomaindiya Cathleen":
            row["person_first_name"] = "NomaIndiya Cathleen"
        if row["person_first_name"] == "Alpheus" and row["person_last_name"] == "Mokabhe":
            row.update(person_first_name="Alpheus Mokabhe", person_last_name="Maziya")
        if row["person_first_name"] == "Ximbi":
            row.update(person_first_name="Dumsani Livingstone", person_last_name="Ximbi")
        name = "%(person_first_name)s %(person_last_name)s" % row
        name = fix_bad_encoding(name.encode("utf-8"))
        identifiers = [{"identifier": row["person_id"], "scheme": "myreps_person_id"}]
        if row["id"]:
            identifiers.append({"identifier": row["id"], "scheme": "myreps_id"})
        PEOPLE[name]["person"]["identifiers"] = identifiers

    with open(data_path + "myreps-national-assembly.html") as na_prev_file:
        na_prev = na_prev_file.read()
    # re.S is passed as a flag argument: Python 3.11+ rejects an inline
    # "(?s)" that is not at the very start of the pattern.
    na_prev = re.search(r'<div[^>]*id="past"[^>]*>.*?</div>', na_prev, re.S).group(0)
    for person in re.findall('<li><a href="/people/view/(.*?)">([^<]*) ([^<]*?)</a> until .*?</li>', na_prev):
        row = dict(zip(cols_xml, ["", person[0], person[1], person[2], ""]))
        # The pattern splits on the last space, which breaks multi-word surnames.
        if row["person_first_name"] == "Patricia de":
            row.update(person_first_name="Patricia", person_last_name="de Lille")
        if row["person_first_name"] == "D van der":
            row.update(person_first_name="D", person_last_name="van der Walt")
        name = "%(person_first_name)s %(person_last_name)s" % row
        PEOPLE[name]["person"]["identifiers"] = [{"identifier": row["person_id"], "scheme": "myreps_person_id"}]

    # NCOP MyReps site data
    with open(data_path + "myreps-ncop.xml") as ncop_file:
        ncop = ncop_file.read()
    for person in ET.fromstring(ncop).iter("Members"):
        row = dict(zip(cols_xml, [person.find(x).text for x in cols_xml]))
        # Change couple of names to match parliament data
        if row["person_first_name"] == "Arthur":
            row["person_first_name"] = "Robert Alfred"
        elif row["person_first_name"] == "Buoang Lemias":
            row["person_first_name"] = "Budang Lemias"
        name = "%(person_first_name)s %(person_last_name)s" % row
        person_id = idFactory.new("person")
        PEOPLE[name] = {
            "id": person_id,
            "person": {
                "id": person_id,
                "given_names": row["person_first_name"],
                "family_name": row["person_last_name"],
                "name": name,
                "identifiers": [
                    {"identifier": row["id"], "scheme": "myreps_id"},
                    {"identifier": row["person_id"], "scheme": "myreps_person_id"},
                ],
            },
        }
        add_membership(
            PEOPLE[name], {"organization_id": "org.mysociety.za/house/ncop", "label": "Delegate", "role": "Delegate"}
        )
        if row["id"] == "7852":
            # Special case of one person resigned since data
            PEOPLE[name]["memberships"][0].update(
                end_date="2013-03-27",
                end_reason="Resigned",
                label="Delegate for Eastern Cape",
                area={"id": "org.mysociety.za/mapit/code/p/" + PROVINCES["Eastern Cape"], "name": "Eastern Cape"},
            )
        if row["person_paries"]:
            add_membership(PEOPLE[name], {"organization_id": ORGANIZATIONS[row["person_paries"]]["id"]})

    # Flatten each {"id", "person", "memberships"} wrapper into the person
    # dict itself; iterate over a snapshot of the keys while rebinding values.
    for name in list(PEOPLE):
        entry = PEOPLE[name]
        entry["person"].update(memberships=entry["memberships"])
        PEOPLE[name] = entry["person"]

    return {"persons": PEOPLE, "organizations": ORGANIZATIONS}
def parse_table(self):
    """Scrape the label/value details table of the member page into ``self.data``.

    Reads ``self.text`` and:

    * appends the phone, fax, cell and address rows to
      ``self.data['contact_details']``;
    * resolves the ``House`` row through ``self.organizations`` and either
      checks it against the existing house membership (adding the province
      area if that is missing) or passes a new house membership to
      ``add_membership``;
    * passes every remaining role from the ``Position(s)`` row to
      ``add_membership``, skipping ministerial and presiding-officer roles.

    Raises KeyError when the page has no ``House`` row and AssertionError
    when the house or province disagrees with the existing membership.
    """
    # re.S is passed as a flag argument: Python 3.11+ rejects an inline
    # "(?s)" that is not at the very start of the pattern.
    rows = re.findall(
        r'<td height="25" valign="middle" class="pad"><b>(.*?):</b></td>\s*'
        r'<td width="70%" valign="middle" class="pad">(.*?)</td>',
        self.text, re.S)
    # Drop placeholder values ("-" and an empty mailto link).
    m = dict((k, v) for k, v in rows if v not in ('-', '<a href = mailto:></a>'))

    for contact_detail in ('Constituency Fax Number', 'Session Fax Number',
                           'Cell Phone Number', 'Constituency Phone Number',
                           'Session Phone Number', 'Constituency Postal Address',
                           'Constituency Street Address'):
        if contact_detail not in m:
            continue
        if '<a target' in m[contact_detail]:
            continue
        # Order matters: "Fax"/"Cell" must be tested before the generic "Number".
        if 'Fax' in contact_detail:
            kind = 'fax'
        elif 'Cell' in contact_detail:
            kind = 'cell'
        elif 'Number' in contact_detail:
            kind = 'voice'
        elif 'Address' in contact_detail:
            kind = 'address'
        self.data.setdefault('contact_details', []).append({
            'type': kind,
            'value': m.pop(contact_detail),
            'note': contact_detail,
        })

    house = self.organizations[m.pop('House')]['id']

    province = None
    if 'Delegate of Province' in m:
        province = m.pop('Delegate of Province')
        label = 'Delegate'
    elif 'Province' in m:
        province = m.pop('Province')
        label = 'Member'
    if self.data['name'] in ('Nqabayomzi Lawrence Kwankwa', 'Cassel Charlie Mathale', 'Wayne Maxim Thring'):
        label = 'Member'

    existing_house = [
        x for x in self.data['memberships'] if 'house' in x['organization_id']
    ]
    if existing_house:
        assert existing_house[0]['organization_id'] == house
        if province:
            if 'area' in existing_house[0]:
                assert existing_house[0]['area']['name'] == province
            else:
                existing_house[0]['area'] = {
                    'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[province],
                    'name': province,
                }
                # NOTE(review): the original indentation was lost; the label
                # is assumed to be rewritten only together with a new area.
                existing_house[0]['label'] = label + ' for ' + province
    else:  # Faku
        dat = {
            'person_id': self.data['id'],
            'organization_id': house,
            'role': label,
            'label': label,
        }
        if province:
            dat['label'] = label + ' for ' + province
            dat['area'] = {
                'id': 'org.mysociety.za/mapit/code/p/' + PROVINCES[province],
                'name': province,
            }
        add_membership(self.data, dat)

    if 'Position(s)' in m:
        posns = [
            {'role': x.strip()}
            for x in m.pop('Position(s)').split('<br />')
            if x.strip() != 'Delegate'
        ]
        # Some portfolios are listed without the "Minister ..." prefix, or
        # split over two lines; normalise them before filtering.
        if len(posns) == 1 and posns[0]['role'] == 'Correctional Services':
            posns[0]['role'] = 'Minister of ' + posns[0]['role']
        elif len(posns) == 1 and posns[0]['role'] == 'Public Service and Administration':
            posns[0]['role'] = 'Minister for the ' + posns[0]['role']
        elif len(posns) == 2 and posns[0]['role'] == 'Minister in The Presidency' and posns[1]['role'] == 'Performance Monitoring and Evaluation as well as Administration in the Presidency':
            posns = [{'role': 'Minister in The Presidency: Performance, Monitoring and Evaluation as well as Administration'}]

        # These come from step 1
        posns = [
            x for x in posns
            if 'Minister' not in x['role'] and x['role'] not in (
                'Deputy President', 'The Chief Whip of the Opposition',
                'House Chairperson', 'Leader Of Opposition',
                'Chief Whip of the Opposition',
                'Deputy Speaker of the National Assembly',
                'Speaker of the National Assembly')
        ]
        for p in posns:
            p.update(person_id=self.data['id'], organization_id=house)
            add_membership(self.data, p)