def parse_individual(emit, record, ind):
    """Fill ``record`` from an individual's XML node and emit it.

    Name parts, aliases, addresses, identity documents, date of birth and
    place of birth are read from ``ind`` and written into ``record``
    (mutated in place); the record is then handed to ``emit.entity``.

    :param emit: object whose ``entity`` method receives the finished record.
    :param record: dict to update; ``record['identities']`` must be a list
        when the node carries documents.
    :param ind: element offering ``findtext`` / ``findall``.
    """
    last_name = ind.findtext('.//THIRD_NAME')
    second_name = ind.findtext('.//SECOND_NAME')
    if last_name is None:
        # No third name: the second name is used as the last name instead.
        last_name = second_name
        second_name = None
    record.update({
        'first_name': ind.findtext('.//FIRST_NAME'),
        'second_name': second_name,
        'last_name': last_name
    })

    for alias in ind.findall('./INDIVIDUAL_ALIAS'):
        parse_alias(alias, record)

    for addr in ind.findall('./INDIVIDUAL_ADDRESS'):
        parse_address(addr, record)

    for ident in ind.findall('./INDIVIDUAL_DOCUMENT'):
        country = normalize_country(ident.findtext('./COUNTRY_OF_ISSUE'))
        number = ident.findtext('./NUMBER')
        if number is None and country is None:
            # Nothing identifying in this document entry.
            continue
        record['identities'].append({
            'type': ident.findtext('./TYPE_OF_DOCUMENT'),
            'number': number,
            'country': country,
        })

    for dob in ind.findall('./INDIVIDUAL_DATE_OF_BIRTH'):
        date = dob.findtext('./DATE')
        if date is None:
            continue
        if ':' in date:
            # Drop everything after the last '-' (presumably a trailing
            # UTC offset such as "-05:00" -- confirm against the feed).
            date = date.rsplit('-', 1)[0]
        approx = dob.findtext('./TYPE_OF_DATE') == 'Approximately'
        if approx and 'date_of_birth' in record:
            # An approximate date never replaces one already recorded.
            continue
        record['date_of_birth'] = date

    for pob in ind.findall('./INDIVIDUAL_PLACE_OF_BIRTH'):
        country = pob.findtext('./COUNTRY')
        record['country_of_birth'] = normalize_country(country)
        place = pob.findtext('./CITY')
        region = pob.findtext('./STATE_PROVINCE')
        # The branches are mutually exclusive: with independent ``if``
        # statements the combined "city (region)" value was always
        # overwritten by the later assignments.
        if place and region:
            record['place_of_birth'] = '%s (%s)' % (place, region)
        elif place:
            record['place_of_birth'] = place
        elif region:
            record['place_of_birth'] = region

    emit.entity(record)
def parse_individual(emit, record, ind):
    """Fill ``record`` from an individual's XML node and emit it.

    First/last name, aliases, addresses, identity documents, date of birth
    and place of birth are read from ``ind`` and written into ``record``
    (mutated in place); the record is then handed to ``emit.entity``.

    :param emit: object whose ``entity`` method receives the finished record.
    :param record: dict to update; ``record['identities']`` must be a list
        when the node carries documents.
    :param ind: element offering ``findtext`` / ``findall``.
    """
    record.update({
        'first_name': ind.findtext('.//FIRST_NAME'),
        'last_name': ind.findtext('.//LAST_NAME')
    })

    for alias in ind.findall('./INDIVIDUAL_ALIAS'):
        parse_alias(alias, record)

    for addr in ind.findall('./INDIVIDUAL_ADDRESS'):
        parse_address(addr, record)

    for ident in ind.findall('./INDIVIDUAL_DOCUMENT'):
        country = normalize_country(ident.findtext('./COUNTRY_OF_ISSUE'))
        number = ident.findtext('./NUMBER')
        if number is None and country is None:
            # Nothing identifying in this document entry.
            continue
        record['identities'].append({
            'type': ident.findtext('./TYPE_OF_DOCUMENT'),
            'number': number,
            'country': country,
        })

    for dob in ind.findall('./INDIVIDUAL_DATE_OF_BIRTH'):
        date = dob.findtext('./DATE')
        if date is None:
            continue
        if ':' in date:
            # Drop everything after the last '-' (presumably a trailing
            # UTC offset such as "-05:00" -- confirm against the feed).
            date = date.rsplit('-', 1)[0]
        approx = dob.findtext('./TYPE_OF_DATE') == 'Approximately'
        if approx and 'date_of_birth' in record:
            # An approximate date never replaces one already recorded.
            continue
        record['date_of_birth'] = date

    for pob in ind.findall('./INDIVIDUAL_PLACE_OF_BIRTH'):
        country = pob.findtext('./COUNTRY')
        record['country_of_birth'] = normalize_country(country)
        place = pob.findtext('./CITY')
        region = pob.findtext('./STATE_PROVINCE')
        # The branches are mutually exclusive: with independent ``if``
        # statements the combined "city (region)" value was always
        # overwritten by the later assignments.
        if place and region:
            record['place_of_birth'] = '%s (%s)' % (place, region)
        elif place:
            record['place_of_birth'] = place
        elif region:
            record['place_of_birth'] = region

    emit.entity(record)
def parse_case(emit, case):
    """Build a record from one case mapping and pass it to ``emit.entity``.

    The display name is produced by splitting ``case['name']`` on ``', '``,
    reversing the parts and joining them with ``combine_name``. The date of
    birth is taken from the first space-separated token of
    ``case['date_of_birth']`` and parsed as either ``%Y`` or ``%d/%m/%Y``;
    when neither format matches, the failure is logged and the record is
    emitted without a date of birth.

    :param emit: object whose ``entity`` method receives the record.
    :param case: mapping offering ``get`` for the case's fields.
    """
    url = case.get('url')
    name = combine_name(*reversed(case.get('name').split(', ')))
    updated = dateutil_parse(case.get('last_updated'))
    record = {
        'uid': make_id('interpol', url.split('/')[-1]),
        'source_url': url,
        'name': name,
        'summary': case.get('reason'),
        'updated_at': updated.date().isoformat(),
        'place_of_birth': case.get('place_of_birth'),
        'gender': case.get('sex', '').lower(),
        'first_name': case.get('forename'),
        'last_name': case.get('present_family_name'),
        'nationality': normalize_country(case.get('nationality')),
        'identities': [],
        'addresses': [],
        'other_names': []
    }
    record.update(SOURCE)

    # A missing date of birth is treated like an unparseable one instead of
    # raising AttributeError on ``None.split``.
    birth = (case.get('date_of_birth') or '').split(' ')[0]
    try:
        record['date_of_birth'] = \
            datetime.strptime(birth, '%Y').date().isoformat()
    except ValueError:
        # Not a bare year; fall back to the day/month/year form.
        try:
            record['date_of_birth'] = \
                datetime.strptime(birth, '%d/%m/%Y').date().isoformat()
        except ValueError as ex:
            log.exception(ex)

    emit.entity(record)
def parse_common(node, type_):
    """Return the base record shared by all entries of the given type.

    :param node: element offering ``findtext``.
    :param type_: value stored under the record's ``type`` key.
    :returns: dict with identifier, program, name, dates, nationality and
        empty ``other_names`` / ``addresses`` / ``identities`` lists,
        overlaid with ``BASE``.
    """
    list_type = node.findtext('./UN_LIST_TYPE').strip()
    reference = node.findtext('./REFERENCE_NUMBER').strip()
    full_name = combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./SECOND_NAME'),
                             node.findtext('./THIRD_NAME'))

    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': '%s (%s)' % (list_type, reference),
        'summary': node.findtext('./COMMENTS1'),
        'name': full_name,
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(
            node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)

    # The original-script name, when present, replaces the combined name.
    original_script = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if original_script is not None:
        record['name'] = original_script

    # The last-update date, when present, wins over the listing date.
    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated

    stamp = record['updated_at']
    if ':' in stamp:
        # Keep only the part before the last '-'.
        record['updated_at'] = stamp.rsplit('-', 1)[0]
    return record
def parse_addr(addr, places):
    """Convert an address-like element into a dict of address fields.

    An element without child elements is treated as a reference: it is
    replaced by ``places.get(addr.get('place-id'))`` before any field is
    read.

    :param addr: element offering ``find`` / ``findtext`` / ``get``.
    :param places: mapping from place id to a fully populated element.
    :returns: dict with ``text``, ``address1``, ``address2``,
        ``postal_code``, ``region`` and ``country`` keys.
    """
    # ``len(element)`` counts children; ``getchildren()`` is deprecated and
    # no longer exists on xml.etree elements in current Python versions.
    if len(addr) == 0:
        addr = places.get(addr.get('place-id'))
    country = addr.find('./country')
    data = {
        'text': addr.findtext('./remarks') or addr.findtext('./location'),
        'address1': addr.findtext('./address-details'),
        'address2': addr.findtext('./p-o-box'),
        'postal_code': addr.findtext('./zip-code'),
        'region': addr.findtext('./area'),
        'country': normalize_country(addr.findtext('./country'))
    }
    # An explicit ISO code on the country element takes precedence over
    # the element's text.
    if country is not None and country.get('iso-code'):
        data['country'] = normalize_country(country.get('iso-code'))
    return data
def wbdeb_parse(emit, html_file):
    """Parse the debarment tables of an HTML file and emit one record per row.

    Only tables whose ``summary`` attribute contains ``'List of Debarred'``
    are read, and only rows with exactly six ``td`` cells. The record uid is
    derived from the first ten hex digits of the SHA-1 over all cell values.

    :param emit: object whose ``entity`` method receives each record.
    :param html_file: path or file object accepted by ``html.parse``.
    """
    doc = html.parse(html_file)
    for table in doc.findall('//table'):
        summary = table.get('summary', '')
        if 'List of Debarred' not in summary:
            continue
        rows = table.findall('.//tr')
        # Logged rather than printed: the ``print`` statement is a syntax
        # error on Python 3 and bypassed the module's logger.
        log.info('%s: %d rows', summary, len(rows))
        for row in rows:
            tds = row.findall('./td')
            if len(tds) != 6:
                continue
            values = [clean_value(td) for td in tds]

            digest = sha1()
            for value in values:
                digest.update(value.encode('utf-8'))
            uid = digest.hexdigest()[:10]

            names = clean_name(values[0])
            if not names:
                log.warning("No name: %r", values)
                continue

            record = {
                'uid': make_id('wb', 'debarred', uid),
                'name': values[0],
                'nationality': normalize_country(values[2]),
                'program': values[5],
                'addresses': [{
                    'text': values[1],
                    'country': normalize_country(values[2])
                }],
                'other_names': [],
                'updated_at': dateutil_parse(values[3]).date().isoformat()
            }
            # Every cleaned name after the first is kept as an alias.
            for name in names[1:]:
                record['other_names'].append({'other_name': name})
            record.update(SOURCE)
            emit.entity(record)
def parse_row(emit, row):
    """Turn one tabular row into a record and pass it to ``emit.entity``.

    The record starts as a copy of ``SOURCE``; its uid is built from the
    row's ``Effective_Date`` and ``Name`` columns.

    :param emit: object whose ``entity`` method receives the record.
    :param row: mapping offering ``get`` for the row's columns.
    """
    record = SOURCE.copy()
    record['uid'] = make_id('us', 'bis',
                            row.get('Effective_Date'), row.get('Name'))
    record['name'] = row.get('Name')
    record['program'] = row.get('FR_Citation')
    record['summary'] = row.get('Action')
    record['updated_at'] = row.get('Last Update')
    record['nationality'] = normalize_country(row.get('Country'))

    # The row describes exactly one address.
    address = {}
    address['address1'] = row.get('Street_Address')
    address['postal_code'] = row.get('Postal_Code')
    address['region'] = row.get('State')
    address['city'] = row.get('City')
    address['country'] = normalize_country(row.get('Country'))
    record['addresses'] = [address]

    emit.entity(record)
def wbdeb_parse(emit, html_file):
    """Parse the debarment tables of an HTML file and emit one record per row.

    Only tables whose ``summary`` attribute contains ``'List of Debarred'``
    are read, and only rows with exactly six ``td`` cells. The record uid is
    derived from the first ten hex digits of the SHA-1 over all cell values.

    :param emit: object whose ``entity`` method receives each record.
    :param html_file: path or file object accepted by ``html.parse``.
    """
    doc = html.parse(html_file)
    for table in doc.findall('//table'):
        summary = table.get('summary', '')
        if 'List of Debarred' not in summary:
            continue
        rows = table.findall('.//tr')
        # Logged rather than printed: the ``print`` statement is a syntax
        # error on Python 3 and bypassed the module's logger.
        log.info('%s: %d rows', summary, len(rows))
        for row in rows:
            tds = row.findall('./td')
            if len(tds) != 6:
                continue
            values = [clean_value(td) for td in tds]

            digest = sha1()
            for value in values:
                digest.update(value.encode('utf-8'))
            uid = digest.hexdigest()[:10]

            names = clean_name(values[0])
            if not names:
                log.warning("No name: %r", values)
                continue

            record = {
                'uid': make_id('wb', 'debarred', uid),
                'name': values[0],
                'nationality': normalize_country(values[2]),
                'program': values[5],
                'addresses': [{
                    'text': values[1],
                    'country': normalize_country(values[2])
                }],
                'other_names': [],
                'updated_at': dateutil_parse(values[3]).date().isoformat()
            }
            # Every cleaned name after the first is kept as an alias.
            for name in names[1:]:
                record['other_names'].append({
                    'other_name': name
                })
            record.update(SOURCE)
            emit.entity(record)
def parse_address(addr, record):
    """Append the address described by ``addr`` to ``record['addresses']``.

    Addresses in which every field is ``None`` are skipped.

    :param addr: element offering ``findtext``.
    :param record: dict whose ``addresses`` list receives the address.
    """
    data = {
        'text': addr.findtext('./NOTE'),
        'address1': addr.findtext('./STREET'),
        'city': addr.findtext('./CITY'),
        'region': addr.findtext('./STATE_PROVINCE'),
        'country': normalize_country(addr.findtext('./COUNTRY')),
    }
    # The previous guard compared a set with the integer 1, which is never
    # true, so empty addresses were always appended.
    if all(value is None for value in data.values()):
        return
    record['addresses'].append(data)
def parse_identity(record, ident, places):
    """Merge one identity element into ``record`` (mutated in place).

    Names, date of birth, place/country of birth, addresses and
    identification documents are read from ``ident``. Values from an
    identity flagged ``main="true"`` overwrite birth data already present;
    other identities only fill fields that are still missing.

    :param record: dict with ``addresses`` and ``identities`` lists.
    :param ident: element offering ``get`` / ``find`` / ``findall``.
    :param places: place lookup forwarded to ``parse_addr``.
    """
    is_main = ident.get('main') == 'true'

    for name_node in ident.findall('./name'):
        parse_name(name_node, record, is_main)

    for dob_node in ident.findall('./day-month-year'):
        if not is_main and 'date_of_birth' in record:
            continue
        record['date_of_birth'] = parse_date(dob_node)

    birth_fields = ('text', 'address1', 'address2', 'city', 'region')
    for pob_node in ident.findall('./place-of-birth'):
        birth_addr = parse_addr(pob_node, places)
        parts = []
        for field in birth_fields:
            value = birth_addr.get(field)
            if value is not None:
                parts.append(value)
        if parts and (is_main or 'place_of_birth' not in record):
            record['place_of_birth'] = ' / '.join(parts)
        birth_country = birth_addr.get('country')
        if birth_country and (is_main or 'country_of_birth' not in record):
            record['country_of_birth'] = birth_country

    for addr_node in ident.findall('./address'):
        record['addresses'].append(parse_addr(addr_node, places))

    for doc_node in ident.findall('./identification-document'):
        issuer = doc_node.find('./issuer')
        if issuer is None:
            country = None
        else:
            # Prefer the issuer's code attribute; fall back to its text.
            country = normalize_country(issuer.get('code') or issuer.text)
        record['identities'].append({
            'country': country,
            'type': doc_node.get('document-type'),
            'number': doc_node.findtext('./number'),
        })
def everypolitician_parse(emit, json_file):
    """Load a JSON file and emit one entity per entry under ``politicians``.

    :param emit: object whose ``entity`` method receives each entity.
    :param json_file: path of the JSON file to read.
    """
    with open(json_file, 'r') as fh:
        payload = json.load(fh)

    for politician in payload.get('politicians'):
        # TODO: add politician
        country = normalize_country(politician.get('country_code'))
        # Only the part of the id after its last '-' goes into the uid.
        id_suffix = politician.get('id').split('-')[-1]
        entity = dict(
            uid=make_id('evpo', id_suffix),
            name=politician.get('name'),
            type='individual',
            addresses=[{'country': country}],
            updated_at=parse_ts(politician.get('legislature_lastmod')),
            source_url=politician.get('source_url'),
        )
        entity.update(PUBLISHER)
        emit.entity(entity)
def parse_common(node, type_):
    """Return the base record shared by all entries of the given type.

    :param node: element offering ``findtext``.
    :param type_: value stored under the record's ``type`` key.
    :returns: dict with identifier, program, name, dates, nationality and
        empty ``other_names`` / ``addresses`` / ``identities`` lists,
        overlaid with ``BASE``.
    """
    list_type = node.findtext('./UN_LIST_TYPE').strip()
    reference = node.findtext('./REFERENCE_NUMBER').strip()
    full_name = combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./LAST_NAME'))

    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': '%s (%s)' % (list_type, reference),
        'summary': node.findtext('./COMMENTS1'),
        'name': full_name,
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(
            node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)

    # The original-script name, when present, replaces the combined name.
    original_script = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if original_script is not None:
        record['name'] = original_script

    # The last-update date, when present, wins over the listing date.
    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated

    stamp = record['updated_at']
    if ':' in stamp:
        # Keep only the part before the last '-'.
        record['updated_at'] = stamp.rsplit('-', 1)[0]
    return record
def everypolitician_parse(emit, json_file):
    """Load a JSON file and emit one entity per entry under ``politicians``.

    :param emit: object whose ``entity`` method receives each entity.
    :param json_file: path of the JSON file to read.
    """
    with open(json_file, 'r') as fh:
        payload = json.load(fh)

    for politician in payload.get('politicians'):
        # TODO: add politician
        country = normalize_country(politician.get('country_code'))
        # Only the part of the id after its last '-' goes into the uid.
        id_suffix = politician.get('id').split('-')[-1]
        entity = dict(
            uid=make_id('evpo', id_suffix),
            name=politician.get('name'),
            type='individual',
            addresses=[{'country': country}],
            updated_at=parse_ts(politician.get('legislature_lastmod')),
            source_url=politician.get('source_url'),
        )
        entity.update(PUBLISHER)
        emit.entity(entity)
def worldleaders_parse(emit, json_file):
    """Load a JSON file and emit one entity per entry under ``leaders``.

    Entries without a name are logged and skipped. The summary is the
    leader's title, followed by the component in parentheses when one is
    given.

    :param emit: object whose ``entity`` method receives each entity.
    :param json_file: path of the JSON file to read.
    """
    with open(json_file, 'r') as fh:
        data = json.load(fh)

    for leader in data.get('leaders'):
        name = leader.get('name')
        # Truthiness also covers a missing name, which previously raised
        # TypeError from ``len(None)`` instead of being skipped.
        if not name:
            log.warning('No name on entity: %(title)s (%(country)s)', leader)
            continue
        country = normalize_country(leader.get('country'))
        summary = leader.get('title')
        if leader.get('component'):
            summary = '%s (%s)' % (summary, leader.get('component'))
        entity = {
            'uid': make_id('us', 'cia', 'worldleaders', country, name),
            'name': name,
            'type': 'individual',
            'summary': summary,
            'addresses': [{'country': country}],
            'updated_at': parse_date(leader.get('last_update')),
            'source_url': leader.get('url')
        }
        entity.update(PUBLISHER)
        emit.entity(entity)
def parse_entry(emit, record, entry):
    """Fill ``record`` from one entry element and pass it to ``emit.entity``.

    The first ``NAME`` child supplies the primary name fields; later ones
    are stored under ``other_names``. For function, place, date and country
    of birth, the longest value seen is kept.

    :param emit: object whose ``entity`` method receives the record.
    :param record: dict to update in place.
    :param entry: element offering ``get`` / ``findall``.
    """
    genders = {'M': 'male', 'F': 'female'}

    def keep_longest(field, candidate):
        # Store the candidate only when it is longer than the current value.
        if candidate and len(candidate) > len(record.get(field, '')):
            record[field] = candidate

    record.update({
        'uid': make_id('eu', 'eeas', entry.get('Id')),
        'type': 'individual',
        'updated_at': entry.get('reg_date'),
        'source_url': entry.get('pdf_link'),
        'program': entry.get('programme'),
        'summary': entry.get('remark')
    })
    if entry.get('Type') != 'P':
        record['type'] = 'entity'

    record['other_names'] = []
    for name_node in entry.findall('./NAME'):
        name_data = {
            'first_name': name_node.findtext('./FIRSTNAME'),
            'last_name': name_node.findtext('./LASTNAME'),
            'middle_name': name_node.findtext('./MIDDLENAME'),
            'other_name': name_node.findtext('./WHOLENAME')
        }
        keep_longest('function', name_node.findtext('./FUNCTION'))
        gender_code = name_node.findtext('./GENDER')
        if gender_code in genders:
            name_data['gender'] = genders[gender_code]
        if 'name' in record:
            record['other_names'].append(name_data)
        else:
            record['name'] = name_data.pop('other_name')
            record.update(name_data)

    record['identities'] = []
    for passport in entry.findall('./PASSPORT'):
        record['identities'].append({
            'type': 'Passport',
            'number': passport.findtext('./NUMBER'),
            'country': normalize_country(passport.findtext('./COUNTRY'))
        })

    record['addresses'] = []
    for address in entry.findall('./ADDRESS'):
        record['addresses'].append({
            'address1': address.findtext('./STREET'),
            'address2': address.findtext('./NUMBER'),
            'city': address.findtext('./CITY'),
            'postal_code': address.findtext('./ZIPCODE'),
            'country': normalize_country(address.findtext('./COUNTRY'))
        })

    for birth in entry.findall('./BIRTH'):
        keep_longest('place_of_birth', birth.findtext('./PLACE'))
        keep_longest('date_of_birth', parse_date(birth.findtext('./DATE')))
        keep_longest('country_of_birth',
                     normalize_country(birth.findtext('./COUNTRY')))

    emit.entity(record)
def parse_entry(emit, record, entry):
    """Fill ``record`` from one entry element and pass it to ``emit.entity``.

    Entries whose ``sdnType`` is not ``'Individual'`` are typed ``entity``
    and carry no ``last_name`` key, on the record or on its aliases.

    :param emit: object whose ``entity`` method receives the record.
    :param record: dict to update in place.
    :param entry: element offering ``findtext`` / ``findall``.
    """
    first_name = entry.findtext('./firstName')
    last_name = entry.findtext('./lastName')
    record.update({
        'uid': make_id('us', 'ofac', entry.findtext('uid')),
        'type': 'individual',
        'program': entry.findtext('./programList/program'),
        'summary': entry.findtext('./remarks'),
        'first_name': first_name,
        'last_name': last_name,
        'name': combine_name(first_name, last_name)
    })

    is_entity = entry.findtext('./sdnType') != 'Individual'
    if is_entity:
        record['type'] = 'entity'
        record.pop('last_name', None)

    aliases = record['other_names'] = []
    for aka in entry.findall('./akaList/aka'):
        aka_first = aka.findtext('./firstName')
        aka_last = aka.findtext('./lastName')
        alias = {
            'type': aka.findtext('./type'),
            'quality': aka.findtext('./category'),
            'first_name': aka_first,
            'last_name': aka_last,
            'other_name': combine_name(aka_first, aka_last)
        }
        if is_entity:
            alias.pop('last_name', None)
        aliases.append(alias)

    identities = record['identities'] = []
    for ident in entry.findall('./idList/id'):
        identities.append({
            'type': ident.findtext('./idType'),
            'number': ident.findtext('./idNumber'),
            'country': normalize_country(ident.findtext('./idCountry'))
        })

    addresses = record['addresses'] = []
    for address in entry.findall('./addressList/address'):
        addresses.append({
            'address1': address.findtext('./address1'),
            'address2': address.findtext('./address2'),
            'city': address.findtext('./city'),
            'country': normalize_country(address.findtext('./country'))
        })

    # Only items flagged as the main entry set the birth fields.
    for pob in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        if pob.findtext('./mainEntry') != 'true':
            continue
        record['place_of_birth'] = pob.findtext('./placeOfBirth')

    for dob in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        if dob.findtext('./mainEntry') != 'true':
            continue
        record['date_of_birth'] = parse_date(dob.findtext('./dateOfBirth'))

    emit.entity(record)
def parse_entry(emit, group, rows):
    """Merge all rows of one group into a single record and emit it.

    The first row provides the primary name; every later row contributes
    an alias. Each row may add an address and passport / NI identities.
    Columns are consumed from the rows with ``pop`` as they are read.

    :param emit: object whose ``entity`` method receives the record.
    :param group: group identifier used to build the uid.
    :param rows: iterable of mutable mappings, one per source row.
    """
    name_columns = ('Title', 'Name 1', 'Name 2', 'Name 3',
                    'Name 4', 'Name 5', 'Name 6')
    address_columns = ('Address 1', 'Address 2', 'Address 3',
                       'Address 4', 'Address 5', 'Address 6')

    record = SOURCE.copy()
    record.update({
        'uid': make_id('gb', 'hmt', group),
        'identities': [],
        'addresses': [],
        'other_names': []
    })

    for row in rows:
        details = {
            'type': row.pop('Group Type').lower(),
            'date_of_birth': parse_date(row.pop('DOB')),
            'place_of_birth': row.pop('Town of Birth'),
            'country_of_birth': normalize_country(
                row.pop('Country of Birth')),
            'nationality': normalize_country(row.get('Nationality')),
            'program': row.pop('Regime'),
            'summary': row.pop('Other Information'),
            'updated_at': parse_date(row.pop('Last Updated')),
            'function': row.pop('Position')
        }
        record.update(details)

        # Read the individual name parts before they are popped below.
        name_fields = {
            'first_name': row.get('Name 1'),
            'second_name': row.get('Name 2'),
            'middle_name': row.get('Name 3'),
            'last_name': row.get('Name 6')
        }
        full_name = combine_name(*[row.pop(col) for col in name_columns])
        if 'name' in record:
            name_fields['other_name'] = full_name
            name_fields['type'] = row.pop('Alias Type')
            record['other_names'].append(name_fields)
        else:
            record['name'] = full_name
            record.update(name_fields)

        address_text = combine_name(
            *[row.pop(col) for col in address_columns])
        if len(address_text):
            record['addresses'].append({
                'text': address_text,
                'postal_code': row.pop('Post/Zip Code')
            })

        if row.get('Passport Details'):
            record['identities'].append({
                'type': 'Passport',
                'number': row.pop('Passport Details'),
                'country': normalize_country(row.get('Nationality'))
            })

        if row.get('NI Number'):
            record['identities'].append({
                'type': 'NI',
                'number': row.pop('NI Number'),
                'country': normalize_country(row.get('Country'))
            })

    emit.entity(record)
def parse_entry(emit, group, rows):
    """Merge all rows of one group into a single record and emit it.

    The first row provides the primary name; every later row contributes
    an alias. Each row may add an address and passport / NI identities.
    Columns are consumed from the rows with ``pop`` as they are read.

    :param emit: object whose ``entity`` method receives the record.
    :param group: group identifier used to build the uid.
    :param rows: iterable of mutable mappings, one per source row.
    """
    name_columns = ('Title', 'Name 1', 'Name 2', 'Name 3',
                    'Name 4', 'Name 5', 'Name 6')
    address_columns = ('Address 1', 'Address 2', 'Address 3',
                       'Address 4', 'Address 5', 'Address 6')

    record = SOURCE.copy()
    record.update({
        'uid': make_id('gb', 'hmt', group),
        'identities': [],
        'addresses': [],
        'other_names': []
    })

    for row in rows:
        details = {
            'type': row.pop('Group Type').lower(),
            'date_of_birth': parse_date(row.pop('DOB')),
            'place_of_birth': row.pop('Town of Birth'),
            'country_of_birth': normalize_country(
                row.pop('Country of Birth')),
            'nationality': normalize_country(row.get('Nationality')),
            'program': row.pop('Regime'),
            'summary': row.pop('Other Information'),
            'updated_at': parse_date(row.pop('Last Updated')),
            'function': row.pop('Position')
        }
        record.update(details)

        # Read the individual name parts before they are popped below.
        name_fields = {
            'first_name': row.get('Name 1'),
            'second_name': row.get('Name 2'),
            'middle_name': row.get('Name 3'),
            'last_name': row.get('Name 6')
        }
        full_name = combine_name(*[row.pop(col) for col in name_columns])
        if 'name' in record:
            name_fields['other_name'] = full_name
            name_fields['type'] = row.pop('Alias Type')
            record['other_names'].append(name_fields)
        else:
            record['name'] = full_name
            record.update(name_fields)

        address_text = combine_name(
            *[row.pop(col) for col in address_columns])
        if len(address_text):
            record['addresses'].append({
                'text': address_text,
                'postal_code': row.pop('Post/Zip Code')
            })

        if row.get('Passport Details'):
            record['identities'].append({
                'type': 'Passport',
                'number': row.pop('Passport Details'),
                'country': normalize_country(row.get('Nationality'))
            })

        if row.get('NI Number'):
            record['identities'].append({
                'type': 'NI',
                'number': row.pop('NI Number'),
                'country': normalize_country(row.get('Country'))
            })

    emit.entity(record)
def parse_entry(emit, entry):
    """Build a record from one entry element and pass it to ``emit.entity``.

    The alias whose ``type-aka`` is ``'N'`` supplies the primary name; all
    other aliases go to ``other_names`` with their quality codes ``'1'`` /
    ``'2'`` mapped to ``'strong'`` / ``'weak'``. When several places or
    dates of birth are listed, the last one wins and a debug message is
    logged.

    :param emit: object whose ``entity`` method receives the record.
    :param entry: element offering ``findtext`` / ``findall``.
    """
    uid = entry.findtext('number-entry')
    record = {
        'uid': make_id('ua', 'sdfm', uid),
        'type': 'individual',
        'publisher': 'State Financial Monitoring Service of Ukraine',
        'publisher_url': 'http://www.sdfm.gov.ua/',
        'source_id': 'UA-SDFM-SANC',
        'source_url': 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en',
        'program': entry.findtext('./program-entry'),
        'summary': entry.findtext('./comments')
    }

    date_entry = entry.findtext('./date-entry')
    if date_entry:
        date_entry = datetime.strptime(date_entry, '%Y%m%d')
        record['updated_at'] = date_entry.date().isoformat()

    is_entity = entry.findtext('./type-entry') != '1'
    if is_entity:
        record['type'] = 'entity'

    qualities = {'1': 'strong', '2': 'weak'}
    record['other_names'] = []
    for aka in entry.findall('./aka-list'):
        data = get_names(aka)
        if aka.findtext('type-aka') == 'N':
            record['name'] = data.pop('other_name', None)
            record.update(data)
        else:
            data['type'] = aka.findtext('./type-aka')
            quality = aka.findtext('./quality-aka')
            # Unknown quality codes are kept unchanged.
            data['quality'] = qualities.get(quality, quality)
            record['other_names'].append(data)

    record['identities'] = []
    for doc in entry.findall('./document-list'):
        record['identities'].append({
            'text': doc.findtext('./document-reg'),
            'number': doc.findtext('./document-id'),
            'country': normalize_country(doc.findtext('./document-country'))
        })

    for doc in entry.findall('./id-number-list'):
        record['identities'].append({'text': doc.text.strip()})

    record['addresses'] = []
    for address in entry.findall('./address-list'):
        record['addresses'].append({
            'text': address.findtext('./address')
        })

    # FIXME: handle multiple
    for pob in entry.findall('./place-of-birth-list'):
        if 'place_of_birth' in record:
            log.debug('Multiple places of birth: %(name)s', record)
        record['place_of_birth'] = pob.text.strip()

    # FIXME: handle multiple
    for dob in entry.findall('./date-of-birth-list'):
        # Check the date key: testing 'place_of_birth' here reported
        # "multiple dates" whenever a place of birth was set and missed
        # genuine duplicates when it was not.
        if 'date_of_birth' in record:
            log.debug('Multiple dates of birth: %(name)s', record)
        record['date_of_birth'] = parse_date(dob.text)

    emit.entity(record)