def parse_name(entity, node):
    """Apply the name described by a name XML node to ``entity``.

    The node's ``name-type`` and ``quality`` attributes are looked up in
    ``NAME_TYPE`` and ``NAME_QUALITY_WEAK`` by subscript. Name parts are
    grouped per spelling: the ``None`` key collects each part's ``./value``
    text, and every ``./spelling-variant`` is collected under its
    ``(lang, script)`` pair. Each group is then applied to the entity as a
    separate name.
    """
    name_prop = NAME_TYPE[node.get("name-type")]
    is_weak = NAME_QUALITY_WEAK[node.get("quality")]

    spellings = defaultdict(dict)
    for part in node.findall("./name-part"):
        part_type = part.get("name-part-type")
        spellings[None][part_type] = part.findtext("./value")

        for variant in part.findall("./spelling-variant"):
            key = (variant.get("lang"), variant.get("script"))
            spellings[key][part_type] = variant.text

    # Only the grouped parts are needed here; the grouping key is unused, and
    # a separate loop name avoids rebinding the mapping being iterated.
    for name_parts in spellings.values():
        entity.add("title", name_parts.pop("title", None), quiet=True)
        entity.add("title", name_parts.pop("suffix", None), quiet=True)
        entity.add("weakAlias", name_parts.pop("other", None), quiet=True)
        entity.add("weakAlias", name_parts.pop("tribal-name", None), quiet=True)
        entity.add(
            "fatherName",
            name_parts.pop("grand-father-name", None),
            quiet=True,
        )
        h.apply_name(
            entity,
            full=name_parts.pop("whole-name", None),
            given_name=name_parts.pop("given-name", None),
            second_name=name_parts.pop("further-given-name", None),
            patronymic=name_parts.pop("father-name", None),
            last_name=name_parts.pop("family-name", None),
            maiden_name=name_parts.pop("maiden-name", None),
            is_weak=is_weak,
            name_prop=name_prop,
            quiet=True,
        )
        # Hand whatever part types were not consumed above to the audit helper.
        h.audit_data(name_parts)
def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    """Build the entity shared by all record kinds and emit its sanction.

    Consumes the common fields from ``data`` (via ``pop``), creates an entity
    of the given ``schema`` whose ID is a slug of ``part`` and ``DATAID``,
    emits the associated sanction and returns the (not yet emitted) entity.
    """
    record = context.make(schema)
    record.id = context.make_slug(part, data.pop("DATAID"))
    record.add("topics", "sanction")

    comments = data.pop("COMMENTS1")
    record.add("notes", h.clean_note(comments))
    note = data.pop("NOTE", None)
    record.add("notes", h.clean_note(note))
    record.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    name_parts = {
        "name1": data.pop("FIRST_NAME", None),
        "name2": data.pop("SECOND_NAME", None),
        "name3": data.pop("THIRD_NAME", None),
        "name4": data.pop("FOURTH_NAME", None),
    }
    h.apply_name(record, quiet=True, **name_parts)

    sanction = h.make_sanction(context, record)

    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    # Earliest known listing date: submission date, else the listing date.
    first_listed = submitted_on or listed_on

    record.add("createdAt", first_listed or modified_at)
    record.add("modifiedAt", modified_at)
    sanction.add("listingDate", first_listed)
    sanction.add("startDate", listed_on)

    for program_key in ("UN_LIST_TYPE", "LIST_TYPE"):
        sanction.add("program", data.pop(program_key))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))

    context.emit(sanction)
    return record
def parse_alias(party, parts, alias):
    """Apply every ``DocumentedName`` of an alias element to ``party``.

    ``parts`` maps a ``NamePartGroupID`` to a name part type label. Values
    sharing a type are joined with a space before being applied.
    """
    # primary = as_bool(alias.get("Primary"))
    is_weak = as_bool(alias.get("LowQuality"))
    alias_type = ref_value("AliasType", alias.get("AliasTypeID"))
    name_prop = ALIAS_TYPES[alias_type]
    shared = {"name_prop": name_prop, "is_weak": is_weak}

    person_fields = (
        ("first_name", "First Name"),
        ("middle_name", "Middle Name"),
        ("maiden_name", "Maiden Name"),
        ("last_name", "Last Name"),
        ("matronymic", "Matronymic"),
        ("patronymic", "Patronymic"),
    )

    for documented in alias.findall("./DocumentedName"):
        collected = defaultdict(str)
        for part_value in documented.findall("./DocumentedNamePart/NamePartValue"):
            kind = parts.get(part_value.get("NamePartGroupID"))
            joined = " ".join((collected[kind], part_value.text))
            collected[kind] = joined.strip()

        h.apply_name(party, full=collected.pop("Entity Name", None), **shared)
        party.add("name", collected.pop("Vessel Name", None))
        party.add("weakAlias", collected.pop("Nickname", None))
        party.add("registrationNumber", collected.pop("Aircraft Name", None))

        person_kwargs = {
            keyword: collected.pop(label, None) for keyword, label in person_fields
        }
        h.apply_name(party, **person_kwargs, **shared)
        # Pass the part types that were not consumed above to the audit helper.
        h.audit_data(collected)
def parse_alias(entity: Entity, alias: Dict[str, str]):
    """Apply one alias record to ``entity``.

    The ``QUALITY`` value selects the target name property through
    ``NAME_QUALITY``; ``ALIAS_NAME`` is applied as the full name. Remaining
    keys other than ``NOTE`` are passed to the audit helper.
    """
    quality = alias.pop("QUALITY", None)
    target_prop = NAME_QUALITY[quality]
    full_name = alias.pop("ALIAS_NAME", None)
    h.apply_name(entity, full=full_name, quiet=True, name_prop=target_prop)
    h.audit_data(alias, ignore=["NOTE"])
def crawl(context: Context):
    """Crawl the JSON source and emit banks, subjects and sanctions.

    For every record this emits:

    * the bank, a ``Company`` identified by charter number and bank name
      (skipped when no slug could be built);
    * the subject, a ``Company`` when ``CompanyName`` is set, otherwise a
      ``Person`` named from ``FirstName``/``LastName``;
    * a sanction attached to the subject.
    """
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding.
    with open(path, "r", encoding="utf-8") as fh:
        data = json.load(fh)

    for record in data:
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if company_name:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        else:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        addr = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Consumed but not used; the full state name is applied above.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        sanction.add("startDate", record.pop("CompleteDate", None))
        sanction.add("endDate", record.pop("TerminationDate", None))
        sanction.add("program", record.pop("EnforcementTypeDescription", None))
        sanction.add("authorityId", record.pop("DocketNumber", None))
        # context.pprint(record)
        context.emit(entity, target=True)
        context.emit(sanction)
def parse_person(context: Context, node):
    """Create a ``Person`` from an XML node and pass it to ``parse_common``.

    The entity ID is derived from the node tag, its ``Number`` and the three
    name components.
    """
    person = context.make("Person")

    given = node.findtext("./Name")
    patronymic = node.findtext("./Patronomic")
    surname = node.findtext("./Surname")
    h.apply_name(
        person,
        given_name=given,
        patronymic=patronymic,
        last_name=surname,
    )

    number = node.findtext("./Number")
    person.id = context.make_id(node.tag, number, given, patronymic, surname)

    birth_date = h.parse_date(node.findtext("./DataBirth"), FORMATS)
    person.add("birthDate", birth_date)
    person.add("birthPlace", node.findtext("./PlaceBirth"))

    # NOTE(review): arguments are passed as (context, node, entity); confirm
    # this matches the parameter order of the parse_common that is in scope.
    parse_common(context, node, person)
def parse_common(context: Context, entity, node):
    """Fill ``entity`` from the shared fields of an XML node.

    Sets the entity ID from ``DATAID``, applies names, alias, notes and the
    ``sanction`` topic, then creates a sanction for the entity and fills in
    its dates, program and reference number.

    Returns the sanction; neither the entity nor the sanction is emitted here.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    name_tags = {
        "given_name": "./FIRST_NAME",
        "second_name": "./SECOND_NAME",
        "name3": "./THIRD_NAME",
        "name4": "./FOURTH_NAME",
    }
    name_kwargs = {kw: node.findtext(tag) for kw, tag in name_tags.items()}
    h.apply_name(entity, quiet=True, **name_kwargs)

    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    comments = node.findtext("./COMMENTS1")
    entity.add("notes", h.clean_note(comments))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)

    listed_on = node.findtext("./LISTED_ON")
    entity.add("createdAt", listed_on)
    sanction.add("listingDate", listed_on)
    sanction.add("startDate", listed_on)

    # values() is evaluated separately for each target, sanction first.
    for target in (sanction, entity):
        target.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))

    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
def crawl(context: Context):
    """Crawl the XML source and emit every ``person`` and ``org`` element.

    Persons are identified by their composed name, birth date and ``iin``;
    organizations by their ``num`` and name. Organization names are split on
    ``"; "`` so that each listed name becomes a separate value.
    """
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)

    for person_el in doc.findall(".//person"):
        name_parts = {
            "given_name": person_el.findtext("./fname"),
            "middle_name": person_el.findtext("./mname"),
            "last_name": person_el.findtext("./lname"),
        }
        birth_date = person_el.findtext("./birthdate")
        iin = person_el.findtext("./iin")

        full_name = h.make_name(**name_parts)
        person_id = context.make_id(full_name, birth_date, iin)
        person = make_entity(context, person_el, "Person", person_id)
        h.apply_name(person, **name_parts)
        person.add("innCode", iin)
        # The raw text is passed as the third argument to parse_date as well.
        person.add("birthDate", h.parse_date(birth_date, FORMATS, birth_date))
        context.emit(person, target=True)

    for org_el in doc.findall(".//org"):
        primary_name = org_el.findtext(".//org_name")
        org_id = context.make_id(org_el.findtext("./num"), primary_name)
        org = make_entity(context, org_el, "Organization", org_id)
        for name_tag in (".//org_name", ".//org_name_en"):
            listed = org_el.findtext(name_tag)
            if listed is not None:
                org.add("name", listed.split("; "))
        context.emit(org, target=True)
def crawl_entity(context: Context, data):
    """Convert one register entry into an entity and its sanction.

    ``Nature`` selects the schema through ``SCHEMATA``; unknown values are
    logged and the entry is skipped. Every value of every ``RegistreDetail``
    item is handed to ``apply_prop`` together with the entity and sanction.
    """
    # context.pprint(data)
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    if schema is None:
        context.log.error("Unknown entity type", nature=nature)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    sanction = h.make_sanction(context, entity)

    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)

    name = data.pop("Nom")
    h.apply_name(
        entity,
        # Reuse a first name already set on the entity (if any) so that the
        # name is composed from it plus the record's "Nom" value.
        first_name=entity.first("firstName", quiet=True),
        tail_name=name,
        quiet=True,
    )
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    # The sanction is populated through apply_prop above; emit it so that the
    # data is not dropped.
    # NOTE(review): confirm apply_prop does not already emit the sanction.
    context.emit(sanction)
def parse_entry(context: Context, entry):
    """Convert one list entry element into an entity, sanction and passports.

    Entries whose ``type-entry`` is ``"2"`` become a ``Person``; all others a
    ``LegalEntity``. Passport documents are emitted as separate entities; the
    entity and its sanction are emitted last.
    """
    entity = context.make("LegalEntity")
    is_person = entry.findtext("./type-entry") == "2"
    if is_person:
        entity = context.make("Person")
    entity.id = context.make_slug(entry.findtext("number-entry"))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))

    raw_listed = entry.findtext("./date-entry")
    if raw_listed:
        listed = datetime.strptime(raw_listed, "%Y%m%d").date()
        entity.add("createdAt", listed)
        sanction.add("listingDate", listed)
        sanction.add("startDate", listed)

    for aka in entry.findall("./aka-list"):
        # Any type other than "N" is treated as an alias; quality "2" is weak.
        is_alias = aka.findtext("type-aka") != "N"
        low_quality = aka.findtext("./quality-aka") == "2"
        h.apply_name(
            entity,
            name1=aka.findtext("./aka-name1"),
            name2=aka.findtext("./aka-name2"),
            name3=aka.findtext("./aka-name3"),
            tail_name=aka.findtext("./aka-name4"),
            alias=is_alias,
            is_weak=low_quality,
            quiet=True,
        )

    for title_node in entry.findall("./title-list"):
        entity.add("title", title_node.text, quiet=True)

    for document in entry.findall("./document-list"):
        reg = document.findtext("./document-reg")
        number = document.findtext("./document-id")
        country = document.findtext("./document-country")

        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport_values = (
            ("holder", entity),
            ("passportNumber", number),
            ("summary", reg),
            ("country", country),
        )
        for prop, value in passport_values:
            passport.add(prop, value)
        context.emit(passport)

    for id_node in entry.findall("./id-number-list"):
        entity.add("idNumber", id_node.text)

    for address_node in entry.findall("./address-list"):
        full_address = address_node.findtext("./address")
        address = h.make_address(context, full=full_address)
        h.apply_address(context, entity, address)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        entity.add_cast("Person", "birthDate", parse_date(dob.text))

    for nat in entry.findall("./nationality-list"):
        for raw_country in multi_split(nat.text, [";", ","]):
            cleaned = remove_bracketed(raw_country)
            entity.add("nationality", cleaned, quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_entry(context: Context, entry: Element):
    """Convert one sanctions entry element into an entity plus documents.

    The schema is resolved from the ``subjectType`` code through the
    ``subject_type`` lookup; unknown types are logged and skipped. Names,
    identification documents, addresses (including contact details attached
    to them), birth data and citizenships are applied to the entity, which is
    emitted last. Identification documents are emitted as separate entities.
    """
    subject_type = entry.find("./subjectType")
    schema = context.lookup_value(
        "subject_type",
        subject_type.get("code"),
        dataset="eu_fsf",
    )
    if schema is None:
        context.log.warning("Unknown subject type", type=subject_type)
        return

    entity = context.make(schema)
    eu_ref = entry.get("euReferenceNumber")
    if eu_ref is not None:
        entity.id = context.make_slug(eu_ref, dataset="eu_fsf")
    else:
        entity.id = context.make_slug("logical", entry.get("logicalId"))
    entity.add("notes", h.clean_note(entry.findtext("./remark")))
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)

    for name in entry.findall("./nameAlias"):
        is_weak = not as_bool(name.get("strong"))
        h.apply_name(
            entity,
            full=name.get("wholeName"),
            first_name=name.get("firstName"),
            middle_name=name.get("middleName"),
            last_name=name.get("lastName"),
            is_weak=is_weak,
            quiet=True,
        )
        entity.add("title", name.get("title"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        entity.add("gender", name.get("gender"), quiet=True)

    for node in entry.findall("./identification"):
        id_type = node.get("identificationTypeCode")
        id_schema = "Passport" if id_type == "passport" else "Identification"
        passport = context.make(id_schema)
        passport.id = context.make_id("ID", entity.id, node.get("logicalId"))
        passport.add("holder", entity)
        passport.add("authority", node.get("issuedBy"))
        passport.add("type", node.get("identificationTypeDescription"))
        passport.add("number", node.get("number"))
        passport.add("number", node.get("latinNumber"))
        passport.add("startDate", node.get("issueDate"))
        passport.add("country", parse_country(node))
        passport.add("country", node.get("countryDescription"))
        for remark in node.findall("./remark"):
            passport.add("summary", remark.text)
        context.emit(passport)

    for node in entry.findall("./address"):
        address = parse_address(context, node)
        h.apply_address(context, entity, address)

        # Iterate the element directly: Element.getchildren() is no longer
        # available in the standard library's ElementTree.
        for child in node:
            # Exact comparison: `in ("regulationSummary")` was a substring
            # test against a plain string, not tuple membership.
            if child.tag == "regulationSummary":
                continue
            elif child.tag == "remark":
                entity.add("notes", child.text)
            elif child.tag == "contactInfo":
                prop = context.lookup_value(
                    "contact_info",
                    child.get("key"),
                    dataset="eu_fsf",
                )
                if prop is None:
                    context.log.warning("Unknown contact info", node=child)
                else:
                    entity.add(prop, child.get("value"))
            else:
                context.log.warning("Unknown address component", node=child)

    for birth in entry.findall("./birthdate"):
        partial_birth = parse_parts(
            birth.get("year"),
            birth.get("month"),
            birth.get("day"),
        )
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthDate", partial_birth)
        address = parse_address(context, birth)
        if address is not None:
            entity.add("birthPlace", address.get("full"))
            entity.add("country", address.get("country"))

    for node in entry.findall("./citizenship"):
        entity.add("nationality", parse_country(node), quiet=True)
        entity.add("nationality", node.get("countryDescription"), quiet=True)

    context.emit(entity, target=True)
def crawl_row(context: Context, data: Dict[str, str]):
    """Convert one source row into a ``LegalEntity`` and its sanction.

    The row may use either of two key spellings for several fields (for
    example ``COMMENTS``/``Comments``); both are consumed. Person-specific
    values are applied with ``add_cast``. Keys left over afterwards, other
    than the ignored ones, are passed to the audit helper.
    """
    entity = context.make("LegalEntity")
    # Pop both spellings with a default. The previous nested form evaluated
    # data.pop("IndividualID") eagerly and raised KeyError for rows that only
    # carry "INDIVIDUAL_Id".
    legacy_id = data.pop("INDIVIDUAL_Id", None)
    current_id = data.pop("IndividualID", None)
    ind_id = legacy_id if legacy_id is not None else current_id
    entity.id = context.make_slug(ind_id)
    assert entity.id, data

    entity.add("notes", h.clean_note(data.pop("COMMENTS", None)))
    entity.add("notes", h.clean_note(data.pop("Comments", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE1", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE2", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE3", None)))

    entity.add_cast("Person", "nationality", data.pop("NATIONALITY", None))
    entity.add_cast("Person", "nationality", data.pop("Nationality", None))
    entity.add_cast("Person", "title", data.pop("TITLE", None))
    entity.add_cast("Person", "title", data.pop("Title", None))
    entity.add_cast("Person", "position", data.pop("DESIGNATION", None))
    entity.add_cast("Person", "position", data.pop("Designation", None))
    entity.add_cast("Person", "birthPlace", data.pop("PLACEOFBIRTH", None))
    entity.add_cast("Person", "birthPlace", data.pop("IndividualPlaceOfBirth", None))
    entity.add_cast("Person", "birthPlace", data.pop("CITY_OF_BIRTH", None))
    entity.add_cast("Person", "birthDate", data.pop("YEAR", None))
    entity.add_cast("Person", "gender", data.pop("GENDER", None))
    entity.add_cast("Person", "birthDate", parse_date(data.pop("DATE", None)))
    entity.add_cast("Person", "birthDate", parse_date(data.pop("DATE_OF_BIRTH", None)))
    dob = parse_date(data.pop("IndividualDateOfBirth", None))
    entity.add_cast("Person", "birthDate", dob)

    # Consumed without being used.
    data.pop("BIRTHPLACE_x0020_CITY", None)
    data.pop("BIRTHPLACE_x0020_STATE_PROVINCE", None)
    entity.add("country", data.pop("BIRTHPLACE_x0020_COUNTRY", None))
    entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))
    entity.add_cast("Person", "birthPlace", data.pop("BIRTHPLACE_x0020_NOTE", None))

    h.apply_name(
        entity,
        full=data.pop("FullName", None),
        given_name=data.pop("FIRST_NAME", None),
        second_name=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )
    alias = data.pop("NAME_ORIGINAL_SCRIPT", None)
    # Skip original-script names that contain "?" characters.
    if alias is not None and "?" not in alias:
        entity.add("alias", alias)
    entity.add("alias", data.pop("SORT_KEY", None))
    data.pop("IndividualAlias", None)

    entity.add_cast("Person", "passportNumber", data.pop("PASSPORT", None))
    entity.add_cast("Person", "passportNumber", data.pop("IndividualDocument", None))
    data.pop("DATE_OF_ISSUE", None)
    data.pop("CITY_OF_ISSUE", None)
    entity.add("country", data.pop("COUNTRY_OF_ISSUE", None))
    entity.add_cast("Person", "idNumber", data.pop("IDNUMBER", None))

    address = h.make_address(
        context,
        # remarks=data.pop("NOTE"),
        full=data.pop("IndividualAddress", None),
        street=data.pop("STREET", None),
        city=data.pop("CITY", None),
        region=data.pop("STATE_PROVINCE", None),
        postal_code=data.pop("ZIP_CODE", None),
        country=data.pop("COUNTRY", None),
    )
    h.apply_address(context, entity, address)

    sanction = h.make_sanction(context, entity)
    inserted_at = parse_date(data.pop("DateInserted", None))
    # The inner pop is evaluated first, so both spellings are always consumed.
    listed_on = data.pop("ListedON", data.pop("ListedOn", None))
    listed_at = parse_date(listed_on)
    entity.add("createdAt", inserted_at or listed_at)
    sanction.add("listingDate", listed_at or inserted_at)
    sanction.add("startDate", data.pop("FROM_YEAR", None))
    sanction.add("endDate", data.pop("TO_YEAR", None))
    sanction.add("program", data.pop("UN_LIST_TYPE", None))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER", None))
    sanction.add("unscId", data.pop("ReferenceNumber", None))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))

    entity.add("topics", "sanction")
    h.audit_data(data, ignore=["VERSIONNUM", "TYPE_OF_DATE", "ApplicationStatus"])
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_row(context: Context, row):
    """Convert one row of the source list into an entity and its sanction.

    The schema is chosen from ``GroupTypeDescription`` through ``TYPES``.
    Fields are consumed from ``row`` with ``pop`` as they are used; whatever
    is left at the end (apart from two ignored keys) is handed to
    ``h.audit_data``.

    Besides the entity and sanction, this emits a ``LegalEntity`` owner for a
    ship, and ``Organization``/``Company`` entities with ``Ownership`` links
    for parent companies and subsidiaries.

    The function returns early, without emitting the entity or sanction, when
    the group type, alias type or alias quality is not recognised.
    """
    group_type = row.pop("GroupTypeDescription")
    schema = TYPES.get(group_type)
    if schema is None:
        context.log.error("Unknown group type", group_type=group_type)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    sanction = h.make_sanction(context, entity)
    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    listed_date = h.parse_date(row.pop("DateListed"), FORMATS)
    sanction.add("listingDate", listed_date)
    designated_date = h.parse_date(row.pop("DateDesignated"), FORMATS)
    sanction.add("startDate", designated_date)

    # Prefer the listing date for createdAt; fall back to the designation
    # date only if adding the listing date left the property empty.
    entity.add("createdAt", listed_date)
    if not entity.has("createdAt"):
        entity.add("createdAt", designated_date)

    sanction.add("authorityId", row.pop("UKSanctionsListRef", None))
    sanction.add("unscId", row.pop("UNRef", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    sanction.add("modifiedAt", last_updated)
    entity.add("modifiedAt", last_updated)

    # TODO: derive topics and schema from this??
    entity_type = row.pop("Entity_Type", None)
    entity.add_cast("LegalEntity", "legalForm", entity_type)

    reg_number = row.pop("Entity_BusinessRegNumber", None)
    entity.add_cast("LegalEntity", "registrationNumber", reg_number)

    # Ship length is consumed but not stored.
    row.pop("Ship_Length", None)
    entity.add_cast("Vessel", "flag", row.pop("Ship_Flag", None))
    flags = split_new(row.pop("Ship_PreviousFlags", None))
    entity.add_cast("Vessel", "pastFlags", flags)
    entity.add_cast("Vessel", "type", row.pop("Ship_Type", None))
    entity.add_cast("Vessel", "tonnage", row.pop("Ship_Tonnage", None))
    entity.add_cast("Vessel", "buildDate", row.pop("Ship_YearBuilt", None))
    entity.add_cast("Vessel", "imoNumber", row.pop("Ship_IMONumber", None))

    # A named ship owner becomes its own entity, linked through an Ownership.
    ship_owner = row.pop("Ship_CurrentOwners", None)
    if ship_owner is not None:
        owner = context.make("LegalEntity")
        owner.id = context.make_slug("named", ship_owner)
        owner.add("name", ship_owner)
        context.emit(owner)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", owner.id)
        ownership.add("owner", owner)
        ownership.add("asset", entity)
        context.emit(ownership)

    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)

    title = split_items(row.pop("Title", None))
    entity.add("title", title, quiet=True)

    pobs = split_items(row.pop("Individual_TownOfBirth", None))
    entity.add_cast("Person", "birthPlace", pobs)

    dob = h.parse_date(row.pop("Individual_DateOfBirth", None), FORMATS)
    entity.add_cast("Person", "birthDate", dob)

    cob = parse_countries(row.pop("Individual_CountryOfBirth", None))
    entity.add_cast("Person", "country", cob)

    nationalities = parse_countries(row.pop("Individual_Nationality", None))
    entity.add_cast("Person", "nationality", nationalities)

    positions = split_items(row.pop("Individual_Position", None))
    entity.add_cast("Person", "position", positions)

    entity.add_cast("Person", "gender", row.pop("Individual_Gender", None))

    # Rows with an unrecognised alias type or quality are dropped here; note
    # that a ship owner and its ownership may already have been emitted.
    name_type = row.pop("AliasType", None)
    name_prop = NAME_TYPES.get(name_type)
    if name_prop is None:
        context.log.warning("Unknown name type", type=name_type)
        return
    name_quality = row.pop("AliasQuality", None)
    is_weak = WEAK_QUALITY.get(name_quality)
    if is_weak is None:
        context.log.warning("Unknown name quality", quality=name_quality)
        return

    h.apply_name(
        entity,
        name1=row.pop("name1", None),
        name2=row.pop("name2", None),
        name3=row.pop("name3", None),
        name4=row.pop("name4", None),
        name5=row.pop("name5", None),
        tail_name=row.pop("Name6", None),
        name_prop=name_prop,
        is_weak=is_weak,
        quiet=True,
    )
    entity.add("alias", row.pop("NameNonLatinScript", None))

    # The six address lines are joined into a single full-address string.
    full_address = join_text(
        row.pop("Address1", None),
        row.pop("Address2", None),
        row.pop("Address3", None),
        row.pop("Address4", None),
        row.pop("Address5", None),
        row.pop("Address6", None),
        sep=", ",
    )

    address = h.make_address(
        context,
        full=full_address,
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    passport_number = row.pop("Individual_PassportNumber", None)
    passport_numbers = split_items(passport_number)
    entity.add_cast("Person", "passportNumber", passport_numbers)
    # Popped so the key is consumed; the value is currently unused.
    passport_detail = row.pop("Individual_PassportDetails", None)
    # passport_details = split_items(passport_detail)
    # TODO: where do I stuff this?

    ni_number = row.pop("Individual_NINumber", None)
    ni_numbers = split_items(ni_number)
    entity.add_cast("Person", "idNumber", ni_numbers)
    # Popped so the key is consumed; the value is currently unused.
    ni_detail = row.pop("Individual_NIDetails", None)
    # ni_details = split_items(ni_detail)
    # TODO: where do I stuff this?

    for phone in split_new(row.pop("PhoneNumber", None)):
        entity.add_cast("LegalEntity", "phone", phone)

    for email in split_new(row.pop("EmailAddress", None)):
        entity.add_cast("LegalEntity", "email", email)

    for website in split_new(row.pop("Website", None)):
        entity.add_cast("LegalEntity", "website", website)

    # Parent companies own this entity.
    for name in parse_companies(context, row.pop("Entity_ParentCompany", None)):
        parent = context.make("Organization")
        parent.id = context.make_slug("named", name)
        parent.add("name", name)
        context.emit(parent)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", parent.id)
        ownership.add("owner", parent)
        ownership.add("asset", entity)
        context.emit(ownership)

    # Subsidiaries are owned by this entity.
    for name in parse_companies(context, row.pop("Entity_Subsidiaries", None)):
        subsidiary = context.make("Company")
        subsidiary.id = context.make_slug("named", name)
        subsidiary.add("name", name)
        context.emit(subsidiary)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", subsidiary.id)
        ownership.add("owner", entity)
        ownership.add("asset", subsidiary)
        context.emit(ownership)

    # Any status other than "A" is only logged; the row is still emitted.
    grp_status = row.pop("GrpStatus", None)
    if grp_status != "A":
        context.log.warning("Unknown GrpStatus", value=grp_status)

    entity.add("notes", h.clean_note(row.pop("OtherInformation", None)))
    h.audit_data(row, ignore=["NonLatinScriptLanguage", "NonLatinScriptType"])

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)