def parse_individual(record, ind):
    """Fill ``record`` from an individual's XML node and emit it.

    :param record: dict to populate; it must already contain an
        ``identities`` list, because documents are appended to it.
    :param ind: element exposing ``findtext``/``findall`` for the
        individual's sub-elements.

    Aliases and addresses are delegated to ``parse_alias`` and
    ``parse_address``. The finished record is passed to ``source.emit``.
    """
    last_name = ind.findtext('.//THIRD_NAME')
    second_name = ind.findtext('.//SECOND_NAME')
    if last_name is None:
        # No third name part: promote the second part to last name.
        last_name = second_name
        second_name = None

    record.update({
        'first_name': ind.findtext('.//FIRST_NAME'),
        'second_name': second_name,
        'last_name': last_name
    })

    for alias in ind.findall('./INDIVIDUAL_ALIAS'):
        parse_alias(alias, record)

    for addr in ind.findall('./INDIVIDUAL_ADDRESS'):
        parse_address(addr, record)

    for ident in ind.findall('./INDIVIDUAL_DOCUMENT'):
        country = normalize_country(ident.findtext('./COUNTRY_OF_ISSUE'))
        number = ident.findtext('./NUMBER')
        # A document with neither a number nor a country carries no data.
        if number is None and country is None:
            continue
        record['identities'].append({
            'type': ident.findtext('./TYPE_OF_DOCUMENT'),
            'number': number,
            'country': country,
        })

    for dob in ind.findall('./INDIVIDUAL_DATE_OF_BIRTH'):
        date = dob.findtext('./DATE')
        if date is None:
            continue
        if ':' in date:
            # Drop everything after the last '-' (presumably a trailing
            # UTC offset such as "-04:00"; confirm against the feed).
            date = date.rsplit('-', 1)[0]
        approx = dob.findtext('./TYPE_OF_DATE') == 'Approximately'
        # An approximate date never replaces one that is already stored.
        if approx and 'date_of_birth' in record:
            continue
        record['date_of_birth'] = date

    for pob in ind.findall('./INDIVIDUAL_PLACE_OF_BIRTH'):
        country = pob.findtext('./COUNTRY')
        record['country_of_birth'] = normalize_country(country)
        place = pob.findtext('./CITY')
        region = pob.findtext('./STATE_PROVINCE')
        # The branches are exclusive so the combined "city (region)" form
        # is not overwritten by the bare city or region.
        if place and region:
            record['place_of_birth'] = '%s (%s)' % (place, region)
        elif place:
            record['place_of_birth'] = place
        elif region:
            record['place_of_birth'] = region

    source.emit(record)
def parse_case(case):
    """Build a record from one scraped case dict and emit it.

    :param case: mapping read with ``.get`` for ``url``, ``name``,
        ``last_updated``, ``reason``, ``place_of_birth``, ``sex``,
        ``forename``, ``present_family_name``, ``nationality`` and
        ``date_of_birth``.

    The name is split on ``', '``, reversed and passed to
    ``combine_name``. The date of birth is parsed as a bare year first,
    then as ``DD/MM/YYYY``; if neither matches, the error is logged and
    the record is emitted without ``date_of_birth``.
    """
    url = case.get('url')
    name = combine_name(*reversed(case.get('name').split(', ')))
    updated = dateutil_parse(case.get('last_updated'))
    record = {
        'uid': make_id('interpol', url.split('/')[-1]),
        'source_url': url,
        'name': name,
        'summary': case.get('reason'),
        'updated_at': updated.date().isoformat(),
        'place_of_birth': case.get('place_of_birth'),
        'gender': case.get('sex', '').lower(),
        'first_name': case.get('forename'),
        'last_name': case.get('present_family_name'),
        'nationality': normalize_country(case.get('nationality')),
        'identities': [],
        'addresses': [],
        'other_names': []
    }
    record.update(SOURCE)

    birth = case.get('date_of_birth')
    # A case without any date of birth is still emitted; previously the
    # missing value raised AttributeError on ``.split``.
    if birth is not None:
        # Only the first space-separated token is parsed.
        birth = birth.split(' ')[0]
        try:
            dt = datetime.strptime(birth, '%Y').date().isoformat()
            record['date_of_birth'] = dt
        except ValueError:
            try:
                dt = datetime.strptime(birth, '%d/%m/%Y').date().isoformat()
                record['date_of_birth'] = dt
            except ValueError as ex:
                log.exception(ex)

    source.emit(record)
def parse_common(node, type_):
    """Build the fields shared by every entry type from ``node``.

    :param node: element exposing ``findtext`` for the entry.
    :param type_: value stored under the record's ``type`` key.
    :returns: a new record dict with empty ``other_names``,
        ``addresses`` and ``identities`` lists, updated with ``BASE``.

    ``NAME_ORIGINAL_SCRIPT``, when present, replaces the combined name,
    and ``LAST_DAY_UPDATED/VALUE``, when present, replaces ``LISTED_ON``
    as ``updated_at``.
    """
    program_ref = '%s (%s)' % (node.findtext('./UN_LIST_TYPE').strip(),
                               node.findtext('./REFERENCE_NUMBER').strip())
    record = {
        'uid': make_id('un', 'sc', node.findtext('./DATAID')),
        'type': type_,
        'program': program_ref,
        'summary': node.findtext('./COMMENTS1'),
        'name': combine_name(node.findtext('./FIRST_NAME'),
                             node.findtext('./SECOND_NAME'),
                             node.findtext('./THIRD_NAME')),
        'function': node.findtext('./DESIGNATION/VALUE'),
        'updated_at': node.findtext('./LISTED_ON'),
        'nationality': normalize_country(node.findtext('./NATIONALITY/VALUE')),
        'other_names': [],
        'addresses': [],
        'identities': []
    }
    record.update(BASE)

    orig = node.findtext('./NAME_ORIGINAL_SCRIPT')
    if orig is not None:
        record['name'] = orig

    last_updated = node.findtext('./LAST_DAY_UPDATED/VALUE')
    if last_updated is not None:
        record['updated_at'] = last_updated

    # ``updated_at`` is None when neither date element exists; guard the
    # membership test so such entries do not raise TypeError.
    updated_at = record['updated_at']
    if updated_at is not None and ':' in updated_at:
        # Drop everything after the last '-' (presumably a trailing UTC
        # offset; confirm against the feed).
        record['updated_at'] = updated_at.rsplit('-', 1)[0]

    return record
def parse_row(row):
    """Turn one spreadsheet-style ``row`` mapping into a record and emit it.

    The record starts as a copy of ``SOURCE`` and carries a single
    address assembled from the row's street, postal code, state, city
    and country columns.
    """
    fields = {}
    fields['uid'] = make_id('us', 'bis', row.get('Effective_Date'),
                            row.get('Name'))
    fields['name'] = row.get('Name')
    fields['program'] = row.get('FR_Citation')
    fields['summary'] = row.get('Action')
    fields['updated_at'] = row.get('Last Update')
    fields['nationality'] = normalize_country(row.get('Country'))

    # The country column feeds both the nationality and the address.
    address = {
        'address1': row.get('Street_Address'),
        'postal_code': row.get('Postal_Code'),
        'region': row.get('State'),
        'city': row.get('City'),
        'country': normalize_country(row.get('Country'))
    }
    fields['addresses'] = [address]

    record = SOURCE.copy()
    record.update(fields)
    source.emit(record)
def wbdeb_parse(html_file):
    """Parse the debarment tables in ``html_file`` and emit one record per row.

    :param html_file: path or file object accepted by ``html.parse``.

    Only tables whose ``summary`` attribute contains "List of Debarred"
    are read, and only rows with exactly six ``<td>`` cells are used.
    Rows for which ``clean_name`` yields no names are logged and skipped.
    """
    doc = html.parse(html_file)
    for table in doc.findall('//table'):
        if 'List of Debarred' not in table.get('summary', ''):
            continue
        rows = table.findall('.//tr')
        # Progress goes through the module logger rather than a
        # Python-2-only ``print`` statement.
        log.info('%s: %d rows', table.get('summary'), len(rows))
        for row in rows:
            tds = row.findall('./td')
            if len(tds) != 6:
                continue
            values = [clean_value(td) for td in tds]

            # The row id is a short SHA-1 digest over all cell values.
            uid = sha1()
            for value in values:
                uid.update(value.encode('utf-8'))
            uid = uid.hexdigest()[:10]

            names = clean_name(values[0])
            if not names:
                log.warning("No name: %r", values)
                continue

            record = {
                'uid': make_id('wb', 'debarred', uid),
                'name': names[0],
                'nationality': normalize_country(values[2]),
                'program': values[5],
                'addresses': [{
                    'text': values[1],
                    'country': normalize_country(values[2])
                }],
                'other_names': [],
                'updated_at': dateutil_parse(values[3]).date().isoformat()
            }
            # Every name after the first is recorded as an alias.
            for name in names[1:]:
                record['other_names'].append({
                    'other_name': name
                })
            record.update(SOURCE)
            source.emit(record)
def parse_address(addr, record):
    """Append the address described by ``addr`` to ``record['addresses']``.

    :param addr: element exposing ``findtext`` for the address parts.
    :param record: dict that must already contain an ``addresses`` list.

    An address whose fields are all ``None`` is skipped. This relies on
    ``normalize_country`` returning ``None`` for a missing country
    (assumption; confirm against that helper).
    """
    data = {
        'text': addr.findtext('./NOTE'),
        'address1': addr.findtext('./STREET'),
        'city': addr.findtext('./CITY'),
        'region': addr.findtext('./STATE_PROVINCE'),
        'country': normalize_country(addr.findtext('./COUNTRY')),
    }
    # The previous guard compared a set with the int 1, so it never
    # matched and empty addresses were always appended.
    if all(value is None for value in data.values()):
        return
    record['addresses'].append(data)
def parse_politician(source, country, legislature, data):
    """Emit one 'individual' entity for a politician via ``source.emit``.

    :param source: object whose ``emit`` receives the finished entity.
    :param country: mapping providing ``code`` and ``name``.
    :param legislature: mapping providing ``lastmod`` and ``name``.
    :param data: mapping providing ``id``, ``name`` and ``source_url``.
    """
    # TODO: add politician
    country_code = normalize_country(country.get('code'))

    entity = {}
    # Only the part of the id after the last '-' goes into the uid.
    entity['uid'] = make_id('evpo', data.get('id').split('-')[-1])
    entity['name'] = data.get('name')
    entity['type'] = 'individual'
    entity['addresses'] = [{'country': country_code}]
    entity['updated_at'] = parse_ts(legislature.get('lastmod'))
    entity['source_url'] = data.get('source_url')
    entity['source'] = '%s (%s)' % (legislature['name'], country['name'])

    entity.update(PUBLISHER)
    source.emit(entity)
def worldleaders_parse(json_file):
    """Load ``json_file`` and emit one entity per entry in its ``leaders`` list.

    :param json_file: path to a JSON document with a ``leaders`` list of
        dicts. Each dict is read for ``name``, ``country``, ``title``,
        ``component``, ``last_update`` and ``url``.

    Entries with a missing or empty name are logged and skipped.
    """
    with open(json_file, 'r') as fh:
        data = json.load(fh)

    for leader in data.get('leaders'):
        name = leader.get('name')
        # A truthiness test also covers a missing name, which used to
        # raise TypeError in ``len(None)``.
        if not name:
            log.warning('No name on entity: %(title)s (%(country)s)', leader)
            continue

        country = normalize_country(leader.get('country'))
        summary = leader.get('title')
        if leader.get('component'):
            summary = '%s (%s)' % (summary, leader.get('component'))

        entity = {
            'uid': make_id('us', 'cia', 'worldleaders', country, name),
            'name': name,
            'type': 'individual',
            'summary': summary,
            'addresses': [{'country': country}],
            'updated_at': parse_date(leader.get('last_update')),
            'source_url': leader.get('url')
        }
        entity.update(PUBLISHER)
        source.emit(entity)
def parse_entry(entry):
    """Convert one blacklist ``entry`` element into a record and emit it.

    :param entry: element exposing ``findtext``/``findall``.

    The record is an 'individual' unless ``type-entry`` differs from
    '1', in which case it is an 'entity'. The ``aka-list`` item of type
    'N' supplies the primary name; all other items become
    ``other_names``. When several places or dates of birth exist, the
    last one wins and a debug message is logged.
    """
    uid = entry.findtext('number-entry')
    record = {
        'uid': make_id('ua', 'sdfm', uid),
        'type': 'individual',
        'publisher': 'Ukraine SDFM',
        'publisher_url': 'http://www.sdfm.gov.ua/',
        'source': 'Blacklist',
        'source_id': 'UA-SDFM-SANC',
        'source_url': 'http://www.sdfm.gov.ua/articles.php?cat_id=87&lang=en',
        'program': entry.findtext('./program-entry'),
        'summary': entry.findtext('./comments')
    }

    date_entry = entry.findtext('./date-entry')
    if date_entry:
        date_entry = datetime.strptime(date_entry, '%Y%m%d')
        record['updated_at'] = date_entry.date().isoformat()

    is_entity = entry.findtext('./type-entry') != '1'
    if is_entity:
        record['type'] = 'entity'

    record['other_names'] = []
    for aka in entry.findall('./aka-list'):
        data = get_names(aka)
        if aka.findtext('type-aka') == 'N':
            # The primary name: its parts are merged into the record.
            record['name'] = data.pop('other_name', None)
            record.update(data)
        else:
            data['type'] = aka.findtext('./type-aka')
            data['quality'] = aka.findtext('./quality-aka')
            # Quality codes '1' and '2' are stored as labels.
            if data['quality'] == '1':
                data['quality'] = 'strong'
            if data['quality'] == '2':
                data['quality'] = 'weak'
            record['other_names'].append(data)

    record['identities'] = []
    for doc in entry.findall('./document-list'):
        data = {
            'text': doc.findtext('./document-reg'),
            'number': doc.findtext('./document-id'),
            'country': normalize_country(doc.findtext('./document-country'))
        }
        record['identities'].append(data)

    for doc in entry.findall('./id-number-list'):
        data = {'text': doc.text.strip()}
        record['identities'].append(data)

    record['addresses'] = []
    for address in entry.findall('./address-list'):
        data = {
            'text': address.findtext('./address')
        }
        record['addresses'].append(data)

    # FIXME: handle multiple
    for pob in entry.findall('./place-of-birth-list'):
        if 'place_of_birth' in record:
            log.debug('Multiple places of birth: %(name)s', record)
        record['place_of_birth'] = pob.text.strip()

    # FIXME: handle multiple
    for dob in entry.findall('./date-of-birth-list'):
        # Test the date key here; the check previously looked at
        # 'place_of_birth' and so fired on the wrong condition.
        if 'date_of_birth' in record:
            log.debug('Multiple dates of birth: %(name)s', record)
        record['date_of_birth'] = parse_date(dob.text)

    source.emit(record)
def parse_entry(source, record, entry):
    """Populate ``record`` from one SDN ``entry`` element and emit it.

    :param source: object whose ``emit`` receives the finished record.
    :param record: dict updated in place.
    :param entry: element exposing ``findtext``/``findall``.

    Entries whose ``sdnType`` is not 'Individual' are typed 'entity' and
    carry no ``last_name``, neither on the record nor on its aliases.
    """
    first_name = entry.findtext('./firstName')
    last_name = entry.findtext('./lastName')
    record.update({
        'uid': make_id('us', 'ofac', entry.findtext('uid')),
        'type': 'individual',
        'program': entry.findtext('./programList/program'),
        'summary': entry.findtext('./remarks'),
        'first_name': first_name,
        'last_name': last_name,
        'name': combine_name(first_name, last_name)
    })

    is_entity = entry.findtext('./sdnType') != 'Individual'
    if is_entity:
        record['type'] = 'entity'
        record.pop('last_name', None)

    aliases = record['other_names'] = []
    for aka in entry.findall('./akaList/aka'):
        aka_first = aka.findtext('./firstName')
        aka_last = aka.findtext('./lastName')
        alias = {
            'type': aka.findtext('./type'),
            'quality': aka.findtext('./category'),
            'first_name': aka_first,
            'other_name': combine_name(aka_first, aka_last)
        }
        # Only individuals keep a separate last name on their aliases.
        if not is_entity:
            alias['last_name'] = aka_last
        aliases.append(alias)

    record['identities'] = [
        {
            'type': ident.findtext('./idType'),
            'number': ident.findtext('./idNumber'),
            'country': normalize_country(ident.findtext('./idCountry'))
        }
        for ident in entry.findall('./idList/id')
    ]

    record['addresses'] = [
        {
            'address1': address.findtext('./address1'),
            'address2': address.findtext('./address2'),
            'city': address.findtext('./city'),
            'country': normalize_country(address.findtext('./country'))
        }
        for address in entry.findall('./addressList/address')
    ]

    # Only items flagged as the main entry are used; the last one wins.
    for place in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        if pob_is_main(place):
            record['place_of_birth'] = place.findtext('./placeOfBirth')

    for birth in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        if pob_is_main(birth):
            record['date_of_birth'] = parse_date(
                birth.findtext('./dateOfBirth'))

    if is_entity:
        record.pop('last_name', None)
    source.emit(record)


def pob_is_main(item):
    """Return True when ``item`` is flagged ``mainEntry`` == 'true'."""
    return item.findtext('./mainEntry') == 'true'
def parse_entry(group, rows):
    """Merge all ``rows`` of one ``group`` into a single record and emit it.

    :param group: group identifier used to build the record's uid.
    :param rows: iterable of mutable row mappings; consumed columns are
        popped from each row.

    The first row supplies the primary name; every later row adds an
    alias. Scalar fields are overwritten by each row, so the last row's
    values remain.
    """
    name_columns = ("Title", "Name 1", "Name 2", "Name 3", "Name 4",
                    "Name 5", "Name 6")
    address_columns = ("Address 1", "Address 2", "Address 3", "Address 4",
                       "Address 5", "Address 6")

    record = SOURCE.copy()
    record.update({
        "uid": make_id("gb", "hmt", group),
        "identities": [],
        "addresses": [],
        "other_names": [],
    })

    for row in rows:
        details = {
            "type": row.pop("Group Type").lower(),
            "date_of_birth": parse_date(row.pop("DOB")),
            "place_of_birth": row.pop("Town of Birth"),
            "country_of_birth": normalize_country(row.pop("Country of Birth")),
            "nationality": normalize_country(row.get("Nationality")),
            "program": row.pop("Regime"),
            "summary": row.pop("Other Information"),
            "updated_at": parse_date(row.pop("Last Updated")),
            "function": row.pop("Position"),
        }
        record.update(details)

        # Read the name parts before the columns are popped below.
        name_parts = {
            "first_name": row.get("Name 1"),
            "second_name": row.get("Name 2"),
            "middle_name": row.get("Name 3"),
            "last_name": row.get("Name 6"),
        }
        full_name = combine_name(*[row.pop(col) for col in name_columns])

        if "name" in record:
            name_parts["other_name"] = full_name
            name_parts["type"] = row.pop("Alias Type")
            record["other_names"].append(name_parts)
        else:
            record["name"] = full_name
            record.update(name_parts)

        address_text = combine_name(*[row.pop(col) for col in address_columns])
        if len(address_text):
            record["addresses"].append({
                "text": address_text,
                "postal_code": row.pop("Post/Zip Code"),
            })

        if row.get("Passport Details"):
            record["identities"].append({
                "type": "Passport",
                "number": row.pop("Passport Details"),
                "country": normalize_country(row.get("Nationality")),
            })

        if row.get("NI Number"):
            record["identities"].append({
                "type": "NI",
                "number": row.pop("NI Number"),
                "country": normalize_country(row.get("Country")),
            })

    source.emit(record)
def parse_entry(record, entry):
    """Populate ``record`` from one sanctions ``entry`` element and emit it.

    :param record: dict updated in place.
    :param entry: element exposing ``get``/``findall``.

    The first ``NAME`` child supplies the primary name and its parts;
    later ones become ``other_names``. For function and the birth
    fields, the longest non-empty value seen is kept.
    """
    gender_labels = {'M': 'male', 'F': 'female'}

    def keep_longest(key, value):
        # Store ``value`` only if it is longer than what is already there.
        if value and len(value) > len(record.get(key, '')):
            record[key] = value

    record.update({
        'uid': make_id('eu', 'eeas', entry.get('Id')),
        'type': 'individual',
        'updated_at': entry.get('reg_date'),
        'source_url': entry.get('pdf_link'),
        'program': entry.get('programme'),
        'summary': entry.get('remark')
    })
    if entry.get('Type') != 'P':
        record['type'] = 'entity'

    record['other_names'] = []
    for aka in entry.findall('./NAME'):
        name_data = {
            'first_name': aka.findtext('./FIRSTNAME'),
            'last_name': aka.findtext('./LASTNAME'),
            'middle_name': aka.findtext('./MIDDLENAME'),
            'other_name': aka.findtext('./WHOLENAME')
        }
        keep_longest('function', aka.findtext('./FUNCTION'))

        gender = aka.findtext('./GENDER')
        if gender in gender_labels:
            name_data['gender'] = gender_labels[gender]

        if 'name' in record:
            record['other_names'].append(name_data)
        else:
            record['name'] = name_data.pop('other_name')
            record.update(name_data)

    record['identities'] = []
    for passport in entry.findall('./PASSPORT'):
        record['identities'].append({
            'type': 'Passport',
            'number': passport.findtext('./NUMBER'),
            'country': normalize_country(passport.findtext('./COUNTRY'))
        })

    record['addresses'] = []
    for address in entry.findall('./ADDRESS'):
        record['addresses'].append({
            'address1': address.findtext('./STREET'),
            'address2': address.findtext('./NUMBER'),
            'city': address.findtext('./CITY'),
            'postal_code': address.findtext('./ZIPCODE'),
            'country': normalize_country(address.findtext('./COUNTRY'))
        })

    for birth in entry.findall('./BIRTH'):
        keep_longest('place_of_birth', birth.findtext('./PLACE'))
        keep_longest('date_of_birth', parse_date(birth.findtext('./DATE')))
        keep_longest('country_of_birth',
                     normalize_country(birth.findtext('./COUNTRY')))

    source.emit(record)