示例#1
0
def crawl_common(context: Context, data: Dict[str, str], part: str,
                 schema: str):
    """Create the entity and emit the sanction for one listing row.

    Fields are consumed from ``data`` with ``pop``, so the mapping is
    mutated. ``DATAID``, ``COMMENTS1``, ``NAME_ORIGINAL_SCRIPT``,
    ``LISTED_ON``, ``LAST_DAY_UPDATED``, ``UN_LIST_TYPE``, ``LIST_TYPE`` and
    ``REFERENCE_NUMBER`` are popped without a default and raise ``KeyError``
    when absent; all other fields are optional.

    The sanction is emitted here. The entity is returned to the caller
    without being emitted by this function.
    """
    record = context.make(schema)
    record.id = context.make_slug(part, data.pop("DATAID"))
    record.add("topics", "sanction")
    comments = data.pop("COMMENTS1")
    record.add("notes", h.clean_note(comments))
    extra_note = data.pop("NOTE", None)
    record.add("notes", h.clean_note(extra_note))
    record.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    name_parts = {
        "name1": data.pop("FIRST_NAME", None),
        "name2": data.pop("SECOND_NAME", None),
        "name3": data.pop("THIRD_NAME", None),
        "name4": data.pop("FOURTH_NAME", None),
    }
    h.apply_name(record, quiet=True, **name_parts)

    listing = h.make_sanction(context, record)
    submitted = parse_date(data.pop("SUBMITTED_ON", None))
    listed = parse_date(data.pop("LISTED_ON"))
    updated = parse_date(data.pop("LAST_DAY_UPDATED"))

    # Earliest known date wins: submission, then listing, then last update.
    first_listed = submitted or listed
    record.add("createdAt", first_listed or updated)
    record.add("modifiedAt", updated)

    listing.add("listingDate", first_listed)
    listing.add("startDate", listed)
    for program_key in ("UN_LIST_TYPE", "LIST_TYPE"):
        listing.add("program", data.pop(program_key))
    listing.add("unscId", data.pop("REFERENCE_NUMBER"))
    listing.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(listing)
    return record
示例#2
0
def parse_party(context: Context, distinct_party, locations, documents):
    """Convert one distinct-party element into an emitted target entity.

    The schema is looked up in ``TYPES`` by the party type, falling back to
    the party sub-type value. If neither is known, an error is logged and
    ``None`` is returned. Otherwise the entity is filled from the profile's
    identities (aliases and registration documents) and features, emitted as
    a target, and returned.

    ``documents`` is read with ``.get(identity_id, [])``; ``locations`` is
    passed through to ``parse_feature``.
    """
    profile = distinct_party.find("Profile")
    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    fallback_schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    # The party type takes precedence; the sub-type is only a fallback.
    schema = TYPES.get(type_, fallback_schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return

    profile_id = profile.get("ID")
    party = context.make(schema)
    party.id = context.make_slug(profile_id)
    comment = distinct_party.findtext("Comment")
    party.add("notes", h.clean_note(comment))
    party.add("sourceUrl", URL % profile_id)

    for identity in profile.findall("./Identity"):
        # Map each name-part group ID to the value of its name-part type.
        name_parts = {
            group.get("ID"): ref_value("NamePartType",
                                       group.get("NamePartTypeID"))
            for group in identity.findall(".//NamePartGroup")
        }

        for alias in identity.findall("./Alias"):
            parse_alias(party, name_parts, alias)

        identity_docs = documents.get(identity.get("ID"), [])
        for regdoc in identity_docs:
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    return party
示例#3
0
def parse_reference(context: Context, reference: int, rows):
    """Build and emit one entity and its sanction from all rows of a reference.

    ``rows`` is iterated three times, so it must be re-iterable (for example
    a list). Each row is a mapping whose fields are consumed with ``pop``.

    Pass 1 resolves the schema of every row via ``context.lookup_value``;
    pass 2 adds the names and remembers the one mapped to the ``name``
    property for building the entity ID; pass 3 adds addresses, dates and
    sanction details. If a row has an unknown entity type or name type a
    warning is logged and the function returns without emitting anything.

    Raises:
        AssertionError: if the rows do not resolve to exactly one schema.
    """
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            # The last row mapped to "name" wins as the ID component.
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            # TODO: consider parsing if it's not a datetime?
            sanction.add("summary", listing_info)

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
示例#4
0
def make_entity(context: Context, el, schema, entity_id):
    """Create an entity for ``el`` and emit its accompanying sanction.

    The entity gets the given ID, the cleaned ``./note`` text as notes and
    the ``sanction`` topic. The sanction carries the ``./correction`` text
    as its summary and is emitted here; the entity is returned without being
    emitted by this function.
    """
    note_text = el.findtext("./note")
    correction_text = el.findtext("./correction")

    result = context.make(schema, target=True)
    result.id = entity_id
    result.add("notes", h.clean_note(note_text))
    result.add("topics", "sanction")

    listing = h.make_sanction(context, result)
    listing.add("summary", correction_text)
    context.emit(listing)

    return result
示例#5
0
def parse_foreign_persons(context: Context, entity, text):
    """Peel comma-separated trailing fragments off ``text`` onto ``entity``.

    Working from the right, each non-empty fragment that ``parse_format``
    recognises with the birth-date pattern is added as ``birthDate``; any
    other non-empty fragment is added as a cleaned note. Whatever is left
    once no comma remains is handed to ``parse_name``.
    """
    while "," in text:
        # Split off the text after the last comma; keep the head for later.
        text, _, tail = text.rpartition(",")
        piece = tail.strip()
        if piece == "":
            continue
        parsed = parse_format(piece, "%d.%m.%Y г. р.")
        if parsed.text is None:
            entity.add("notes", h.clean_note(piece))
        else:
            entity.add("birthDate", parsed)
    parse_name(entity, text)
示例#6
0
def crawl_physical(context: Context) -> None:
    """Emit one ``Person`` target for every row of the "physical" resource.

    Rows come from ``json_resource(context, PHYSICAL_URL, "physical")`` and
    are consumed with ``pop``. ``ukaz_id`` and ``index`` are required (a
    missing key raises ``KeyError``); all other fields are optional. The
    remainder of each row is handed to ``handle_sanction``.
    """
    rows = json_resource(context, PHYSICAL_URL, "physical")
    for row in rows:
        person = context.make("Person")
        ukaz_id = row.pop("ukaz_id")
        index = row.pop("index")
        person.id = context.make_slug("person", ukaz_id, index)

        for name_key in ("name_ukr", "name_original"):
            person.add("name", row.pop(name_key, None))

        alternatives = row.pop("name_alternative", None)
        for alias in multi_split(alternatives, [";", "/"]):
            person.add("alias", alias)

        person.add("notes", h.clean_note(row.pop("additional", None)))

        citizenships = row.pop("citizenship", None)
        for country in multi_split(citizenships, [", "]):
            person.add("nationality", country)

        person.add("birthDate", row.pop("birthdate", None))
        person.add("birthPlace", row.pop("birthplace", None))
        occupation = row.pop("occupation", None)
        person.add("position", remove_control_chars(occupation))
        handle_address(context, person, row.pop("livingplace", None))
        handle_sanction(context, person, row)

        context.emit(person, target=True)
0
def parse_common(context: Context, entity, node):
    """Fill ``entity`` from the shared child elements of ``node``.

    Sets the entity ID from ``./DATAID``, applies the four name parts, the
    original-script alias, notes and the ``sanction`` topic, then creates a
    sanction carrying the listing date, last-update values, program and
    reference number.

    Returns the sanction; neither the entity nor the sanction is emitted by
    this function.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    name_parts = {
        "given_name": node.findtext("./FIRST_NAME"),
        "second_name": node.findtext("./SECOND_NAME"),
        "name3": node.findtext("./THIRD_NAME"),
        "name4": node.findtext("./FOURTH_NAME"),
    }
    h.apply_name(entity, quiet=True, **name_parts)

    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    entity.add("notes", h.clean_note(node.findtext("./COMMENTS1")))
    entity.add("topics", "sanction")

    listing = h.make_sanction(context, entity)

    # The single LISTED_ON value feeds the entity and both sanction dates.
    listed_on = node.findtext("./LISTED_ON")
    entity.add("createdAt", listed_on)
    for date_prop in ("listingDate", "startDate"):
        listing.add(date_prop, listed_on)

    updated_node = node.find("./LAST_DAY_UPDATED")
    listing.add("modifiedAt", values(updated_node))
    entity.add("modifiedAt", values(updated_node))

    listing.add("program", node.findtext("./UN_LIST_TYPE"))
    listing.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return listing
示例#8
0
def parse_entry(context: Context, target, programs, places, updated_at):
    """Convert one sanctions target element into emitted entities.

    The target's child decides the schema: ``./entity`` gives a
    ``LegalEntity``, ``./individual`` a ``Person`` and ``./object`` a
    ``Vessel`` (a warning is logged for object types other than
    ``"vessel"``). A target with none of these children is logged and
    skipped.

    Emits the entity (as a target), its sanction, and one relationship
    entity per ``./relation`` whose type is known to
    ``context.lookup("relations", ...)``.

    Args:
        context: crawler context used to make, look up, log and emit.
        target: the target XML element.
        programs: mapping from ``sanctions-set-id`` to program name.
        places: passed through to ``parse_identity``.
        updated_at: not used by this function.
    """
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        if node is None:
            # None of the three known child elements is present; there is
            # nothing to read attributes from.
            context.log.warning(
                "Target has no entity, individual or object node",
                target=target,
            )
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            context.log.warning("Unknown target type",
                                target=target,
                                object_type=object_type)
        entity = context.make("Vessel")

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)
    for other in node.findall("./other-information"):
        # An element without text has ``text is None``; treat it as empty.
        value = (other.text or "").strip()
        is_vessel = entity.schema.is_a("Vessel")
        # Only split when there is a colon; an "imo..." note without one is
        # kept as a plain note.
        if is_vessel and value.lower().startswith("imo") and ":" in value:
            _, _, imo_num = value.partition(":")
            entity.add("imoNumber", imo_num)
        else:
            entity.add("notes", h.clean_note(value))

    sanction = h.make_sanction(context, entity)
    publication_dates = set()
    for mod in target.findall("./modification"):
        published = mod.get("publication-date")
        if published is not None:
            publication_dates.add(published)
        sanction.add("listingDate", published)
        sanction.add("startDate", mod.get("effective-date"))
    if publication_dates:
        entity.add("createdAt", min(publication_dates))
        entity.add("modifiedAt", max(publication_dates))

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for justification in node.findall("./justification"):
        # TODO: should this go into sanction:reason?
        entity.add("notes", h.clean_note(justification.text))

    for relation in node.findall("./relation"):
        rel_type = relation.get("relation-type")
        target_id = context.make_slug(relation.get("target-id"))
        res = context.lookup("relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown relationship type",
                type=rel_type,
                source=entity,
                target=target_id,
            )
            continue

        rel = context.make(res.schema)
        rel.id = context.make_slug(relation.get("ssid"))
        rel.add(res.source, entity.id)
        rel.add(res.target, target_id)
        rel.add(res.text, rel_type)

        # NOTE: the entity on the other end of the relation is only
        # referenced by ID here; it is not created or emitted.
        entity.add_schema(rel.schema.get(res.source).range)
        context.emit(rel)

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
示例#9
0
def crawl_organizations(context: Context):
    """Crawl the organizations spreadsheet and emit entities and links.

    For every record returned by ``excel_records`` an ``Organization`` (as
    a target) and its sanction are emitted; records whose English and Hebrew
    names yield no ID are skipped. Record fields are consumed with ``pop``
    and whatever remains is passed to ``h.audit_data``.

    After all records are processed, an ``UnknownLink`` is emitted for each
    recorded pair of internal sequence IDs where both sides were seen.
    """
    path = context.fetch_resource("organizations.xlsx", ORG_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)
    seq_ids = {}  # internal sequence ID -> entity ID
    links = []  # (larger seq ID, smaller seq ID) pairs
    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        name_en = record.pop("organization_name_english", None)
        name_he = record.pop("organization_name_hebrew", None)
        entity = context.make("Organization")
        entity.id = context.make_id(name_en, name_he)
        if entity.id is None:
            continue
        if seq_id is not None:
            seq_ids[seq_id] = entity.id
        entity.add("name", name_en)
        entity.add("name", name_he)
        entity.add("topics", "crime.terror")
        entity.add("notes", h.clean_note(lang_pick(record, "comments")))
        entity.add("notes", h.clean_note(record.pop("column_42", None)))
        entity.add("email", record.pop("email", None))
        entity.add("country", record.pop("country_hebrew", None))
        entity.add("country", record.pop("country_english", None))
        entity.add("registrationNumber", record.pop("corporation_id", None))
        entity.add("legalForm", lang_pick(record, "corporation_type"))
        entity.add("jurisdiction", lang_pick(record, "location_of_formation"))
        date = parse_date(record.pop("date_of_corporation", None))
        entity.add("incorporationDate", date)
        # Iterate over a snapshot of the keys because fields are popped
        # while looping.
        for field in list(record.keys()):
            if field.startswith("organization_name_"):
                entity.add("alias", record.pop(field, None))
            elif field.startswith("telephone"):
                entity.add("phone", record.pop(field, None))
            elif field.startswith("website"):
                entity.add("website", record.pop(field, None))

        entity.add("phone", record.pop("column_70", None))
        entity.add("website", record.pop("column_73", None))

        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("seq_num_in_other_countries", None))
        sanction.add("program", record.pop("designation_type", None))
        sanction.add("reason", lang_pick(record, "designation_justification"))
        sanction.add("authority", lang_pick(record, "designated_by"))
        sanction.add("publisher", record.pop("public_records_references", None))

        # These values are read and discarded -- presumably so they do not
        # remain in the record passed to h.audit_data below (verify).
        lang_pick(record, "designated_by_abroad")
        record.pop("date_designated_in_other_countries", None)

        # The cell may be missing or hold None; both mean "no links".
        linked = record.pop("linked_to_internal_seq_id", None) or ""
        if seq_id is not None:
            # Without an own sequence ID there is nothing to link from, and
            # max()/min() cannot order a string against None.
            for link in linked.split(";"):
                if not link:
                    continue
                # Order the pair so both directions produce the same tuple.
                links.append((max(link, seq_id), min(link, seq_id)))

        street = lang_pick(record, "street")
        city = lang_pick(record, "city_village")
        if street or city:
            address = h.make_address(
                context, street=street, city=city, country_code=entity.first("country")
            )
            h.apply_address(context, entity, address)

        for field in (
            "date_of_temporary_designation",
            "date_of_permenant_designation",
            "date_designation_in_west_bank",
        ):
            parse_interval(sanction, record.pop(field, None))

        context.emit(entity, target=True)
        context.emit(sanction)
        h.audit_data(record)

    for subject_key, object_key in links:
        subject_id = seq_ids.get(subject_key)
        object_id = seq_ids.get(object_key)
        if subject_id is None or object_id is None:
            # One side was never seen (or was skipped); drop the link.
            continue
        link = context.make("UnknownLink")
        link.id = context.make_id(subject_id, object_id)
        link.add("subject", subject_id)
        link.add("object", object_id)
        context.emit(link)
示例#10
0
def emit_row(context: Context, sheet: str, section: str, row: Dict[str, List[str]]):
    """Emit the entity and sanction described by one parsed sheet row.

    The schema is looked up from ``section``; when there is none a warning
    is logged and nothing is emitted. Rows whose English and Japanese names
    produce no ID are skipped silently. ``name_english`` and
    ``name_japanese`` are required keys; every other field is optional and
    is consumed from ``row`` with ``pop``.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section", section=section, sheet=sheet)
        return

    entity = context.make(schema)
    name_english = row.pop("name_english")
    name_japanese = row.pop("name_japanese")
    entity.id = context.make_id(*name_english, *name_japanese)
    if entity.id is None:
        return

    # Japanese names are the primary name only when no English one was added.
    entity.add("name", parse_names(name_english))
    japanese_prop = "alias" if entity.has("name") else "name"
    entity.add(japanese_prop, parse_names(name_japanese))

    name_fields = (
        ("alias", "alias"),
        ("alias", "known_alias"),
        ("weakAlias", "weak_alias"),
        ("weakAlias", "nickname"),
        ("previousName", "past_alias"),
        ("previousName", "old_name"),
    )
    for prop, key in name_fields:
        entity.add(prop, parse_names(row.pop(key, [])))

    entity.add_cast("Person", "position", row.pop("position", []))
    birth_date = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", birth_date)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))

    for key in ("id_number", "identification_number"):
        entity.add("idNumber", row.pop(key, []))
    for key in ("other_information", "details"):
        entity.add("notes", h.clean_note(row.pop(key, None)))
    for key in ("phone", "fax"):
        entity.add("phone", row.pop(key, []))

    for key in ("address", "where"):
        for address_full in row.pop(key, []):
            address = h.make_address(context, full=address_full)
            h.apply_address(context, entity, address)

    # A title only has its own property on persons; otherwise keep it as a note.
    title = row.pop("title", [])
    title_prop = "title" if entity.schema.is_a("Person") else "notes"
    entity.add(title_prop, title)

    for key in ("citizenship", "activity_area"):
        entity.add("country", row.pop(key, []))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    for key in ("root_nomination", "reason_res1483"):
        sanction.add("reason", row.pop(key, None))
    sanction.add("authorityId", row.pop("notification_number", None))
    sanction.add("unscId", row.pop("designated_un", None))

    for key in ("notification_date", "designated_date"):
        sanction.add("startDate", parse_date(row.pop(key, [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
示例#11
0
def apply_prop(context: Context, entity, sanction, field, value):
    """Apply one typed detail record to ``entity`` or ``sanction``.

    ``field`` selects the handling and ``value`` is a mapping from which the
    relevant keys are popped. Most fields copy a single key to a single
    property; dates of birth, addresses, places of birth, identifications
    and motives get dedicated handling. An unrecognised ``field`` is logged
    as a warning.
    """
    # field -> (entity property, key popped from ``value``)
    entity_fields = {
        "ALIAS": ("alias", "Alias"),
        "SEXE": ("gender", "Sexe"),
        "PRENOM": ("firstName", "Prenom"),
        "NATIONALITE": ("nationality", "Pays"),
        "TITRE": ("position", "Titre"),
        "SITE_INTERNET": ("website", "SiteInternet"),
        "TELEPHONE": ("phone", "Telephone"),
        "COURRIEL": ("email", "Courriel"),
        "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
        "PASSEPORT": ("passportNumber", "NumeroPasseport"),
        "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
    }
    # field -> (sanction property, key popped from ``value``)
    sanction_fields = {
        "REFERENCE_UE": ("authorityId", "ReferenceUe"),
        "REFERENCE_ONU": ("unscId", "ReferenceOnu"),
        # TODO: derive target countries?
        "FONDEMENT_JURIDIQUE": ("program", "FondementJuridiqueLabel"),
    }

    if field in entity_fields:
        prop, key = entity_fields[field]
        entity.add(prop, value.pop(key))
        return

    if field in sanction_fields:
        prop, key = sanction_fields[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        entity.add("birthDate", parse_parts(year, month, day))
        return

    if field in ("ADRESSE_PM", "ADRESSE_PP"):
        full = value.pop("Adresse")
        country = value.pop("Pays")
        address = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, address)
        return

    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return

    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        result = context.lookup("identification", comment)
        if result is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        if result.prop is None:
            # The lookup matched but maps to no property: nothing to add.
            return
        schema = result.schema or entity.schema
        entity.add_cast(schema, result.prop, content)
        if result.prop == "notes":
            # When the content lands in notes, keep the comment with it.
            entity.add(result.prop, h.clean_note(comment))
        return

    if field == "MOTIFS":
        motifs = value.pop("Motifs")
        sanction.add("reason", motifs)
        entity.add("notes", motifs)
        return

    context.log.warning("Unknown field", field=field, value=value)
示例#12
0
def parse_entry(context: Context, entry: Element):
    """Convert one sanctions entry element into emitted entities.

    The schema comes from the ``subject_type`` lookup on the entry's
    ``./subjectType`` code; when it is unknown a warning is logged and
    nothing is emitted. The entity ID is built from ``euReferenceNumber``
    when present, otherwise from ``logicalId``.

    Emits one ``Passport``/``Identification`` entity per
    ``./identification`` child and finally the entity itself as a target.
    Names, addresses, contact details, birth data and citizenships are
    added to the entity.
    """
    subject_type = entry.find("./subjectType")
    schema = context.lookup_value(
        "subject_type",
        subject_type.get("code"),
        dataset="eu_fsf",
    )
    if schema is None:
        context.log.warning("Unknown subject type", type=subject_type)
        return

    entity = context.make(schema)
    eu_ref = entry.get("euReferenceNumber")
    if eu_ref is not None:
        entity.id = context.make_slug(eu_ref, dataset="eu_fsf")
    else:
        entity.id = context.make_slug("logical", entry.get("logicalId"))
    entity.add("notes", h.clean_note(entry.findtext("./remark")))
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)

    for name in entry.findall("./nameAlias"):
        is_weak = not as_bool(name.get("strong"))
        h.apply_name(
            entity,
            full=name.get("wholeName"),
            first_name=name.get("firstName"),
            middle_name=name.get("middleName"),
            last_name=name.get("lastName"),
            is_weak=is_weak,
            quiet=True,
        )
        entity.add("title", name.get("title"), quiet=True)
        entity.add("position", name.get("function"), quiet=True)
        entity.add("gender", name.get("gender"), quiet=True)

    for node in entry.findall("./identification"):
        id_type = node.get("identificationTypeCode")
        id_schema = "Passport" if id_type == "passport" else "Identification"
        passport = context.make(id_schema)
        passport.id = context.make_id("ID", entity.id, node.get("logicalId"))
        passport.add("holder", entity)
        passport.add("authority", node.get("issuedBy"))
        passport.add("type", node.get("identificationTypeDescription"))
        passport.add("number", node.get("number"))
        passport.add("number", node.get("latinNumber"))
        passport.add("startDate", node.get("issueDate"))
        passport.add("country", parse_country(node))
        passport.add("country", node.get("countryDescription"))
        for remark in node.findall("./remark"):
            passport.add("summary", remark.text)
        context.emit(passport)

    for node in entry.findall("./address"):
        address = parse_address(context, node)
        h.apply_address(context, entity, address)

        # Iterate the element directly: ``getchildren()`` is deprecated and
        # no longer exists on xml.etree elements in Python 3.9+.
        for child in node:
            # Exact match; ``in ("regulationSummary")`` would be a substring
            # test against the string because the parentheses are no tuple.
            if child.tag == "regulationSummary":
                continue
            elif child.tag == "remark":
                entity.add("notes", child.text)
            elif child.tag == "contactInfo":
                prop = context.lookup_value(
                    "contact_info",
                    child.get("key"),
                    dataset="eu_fsf",
                )
                if prop is None:
                    context.log.warning("Unknown contact info", node=child)
                else:
                    entity.add(prop, child.get("value"))
            else:
                context.log.warning("Unknown address component", node=child)

    for birth in entry.findall("./birthdate"):
        partial_birth = parse_parts(birth.get("year"), birth.get("month"),
                                    birth.get("day"))
        entity.add("birthDate", birth.get("birthdate"))
        entity.add("birthDate", partial_birth)
        address = parse_address(context, birth)
        if address is not None:
            entity.add("birthPlace", address.get("full"))
            entity.add("country", address.get("country"))

    for node in entry.findall("./citizenship"):
        entity.add("nationality", parse_country(node), quiet=True)
        entity.add("nationality", node.get("countryDescription"), quiet=True)

    context.emit(entity, target=True)
示例#13
0
def crawl_row(context: Context, data: Dict[str, str]):
    """Emit a ``LegalEntity`` target and its sanction from one data row.

    The row may use either of two column-naming styles (for example
    ``INDIVIDUAL_Id`` or ``IndividualID``); both spellings are consumed with
    ``pop``, so ``data`` is mutated. Person-only properties are added with
    ``add_cast("Person", ...)``. Whatever remains in ``data`` afterwards is
    passed to ``h.audit_data``.

    Raises:
        AssertionError: if no entity ID could be built from the row.
    """
    entity = context.make("LegalEntity")
    # Pop both spellings with an explicit default. A nested
    # ``data.pop("IndividualID")`` used as the default argument is evaluated
    # eagerly and raises KeyError for rows that only have ``INDIVIDUAL_Id``.
    fallback_id = data.pop("IndividualID", None)
    ind_id = data.pop("INDIVIDUAL_Id", fallback_id)
    entity.id = context.make_slug(ind_id)
    assert entity.id, data
    entity.add("notes", h.clean_note(data.pop("COMMENTS", None)))
    entity.add("notes", h.clean_note(data.pop("Comments", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE1", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE2", None)))
    entity.add("notes", h.clean_note(data.pop("NOTE3", None)))
    entity.add_cast("Person", "nationality", data.pop("NATIONALITY", None))
    entity.add_cast("Person", "nationality", data.pop("Nationality", None))
    entity.add_cast("Person", "title", data.pop("TITLE", None))
    entity.add_cast("Person", "title", data.pop("Title", None))
    entity.add_cast("Person", "position", data.pop("DESIGNATION", None))
    entity.add_cast("Person", "position", data.pop("Designation", None))
    entity.add_cast("Person", "birthPlace", data.pop("PLACEOFBIRTH", None))
    entity.add_cast("Person", "birthPlace",
                    data.pop("IndividualPlaceOfBirth", None))
    entity.add_cast("Person", "birthPlace", data.pop("CITY_OF_BIRTH", None))
    entity.add_cast("Person", "birthDate", data.pop("YEAR", None))
    entity.add_cast("Person", "gender", data.pop("GENDER", None))
    entity.add_cast("Person", "birthDate", parse_date(data.pop("DATE", None)))
    entity.add_cast("Person", "birthDate",
                    parse_date(data.pop("DATE_OF_BIRTH", None)))
    dob = parse_date(data.pop("IndividualDateOfBirth", None))
    entity.add_cast("Person", "birthDate", dob)

    # Birthplace city and state/province are consumed but not stored.
    data.pop("BIRTHPLACE_x0020_CITY", None)
    data.pop("BIRTHPLACE_x0020_STATE_PROVINCE", None)
    entity.add("country", data.pop("BIRTHPLACE_x0020_COUNTRY", None))
    entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))
    entity.add_cast("Person", "birthPlace",
                    data.pop("BIRTHPLACE_x0020_NOTE", None))

    h.apply_name(
        entity,
        full=data.pop("FullName", None),
        given_name=data.pop("FIRST_NAME", None),
        second_name=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    alias = data.pop("NAME_ORIGINAL_SCRIPT", None)
    # Original-script names containing "?" are not added as aliases.
    if alias is not None and "?" not in alias:
        entity.add("alias", alias)
    entity.add("alias", data.pop("SORT_KEY", None))
    data.pop("IndividualAlias", None)

    entity.add_cast("Person", "passportNumber", data.pop("PASSPORT", None))
    entity.add_cast("Person", "passportNumber",
                    data.pop("IndividualDocument", None))
    data.pop("DATE_OF_ISSUE", None)
    data.pop("CITY_OF_ISSUE", None)
    entity.add("country", data.pop("COUNTRY_OF_ISSUE", None))
    entity.add_cast("Person", "idNumber", data.pop("IDNUMBER", None))

    address = h.make_address(
        context,
        full=data.pop("IndividualAddress", None),
        street=data.pop("STREET", None),
        city=data.pop("CITY", None),
        region=data.pop("STATE_PROVINCE", None),
        postal_code=data.pop("ZIP_CODE", None),
        country=data.pop("COUNTRY", None),
    )
    h.apply_address(context, entity, address)

    sanction = h.make_sanction(context, entity)
    inserted_at = parse_date(data.pop("DateInserted", None))
    # Consume both spellings; the upper-case one wins when both are present.
    listed_on_fallback = data.pop("ListedOn", None)
    listed_on = data.pop("ListedON", listed_on_fallback)
    listed_at = parse_date(listed_on)
    entity.add("createdAt", inserted_at or listed_at)
    sanction.add("listingDate", listed_at or inserted_at)
    sanction.add("startDate", data.pop("FROM_YEAR", None))
    sanction.add("endDate", data.pop("TO_YEAR", None))
    sanction.add("program", data.pop("UN_LIST_TYPE", None))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER", None))
    sanction.add("unscId", data.pop("ReferenceNumber", None))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))

    entity.add("topics", "sanction")
    h.audit_data(data,
                 ignore=["VERSIONNUM", "TYPE_OF_DATE", "ApplicationStatus"])
    context.emit(entity, target=True)
    context.emit(sanction)
示例#14
0
def _emit_named_ownership(context: Context, entity, name, schema, *,
                          entity_is_owner):
    """Emit a stub entity called `name` and an Ownership tying it to `entity`.

    The stub is created with `schema`, gets a slug id built from
    ("named", name) and is emitted before the Ownership. When
    `entity_is_owner` is true, `entity` is the owner and the stub the asset;
    otherwise the roles are swapped. The Ownership id is always derived from
    (entity.id, "owns", stub.id), regardless of direction.
    """
    other = context.make(schema)
    other.id = context.make_slug("named", name)
    other.add("name", name)
    context.emit(other)

    owner, asset = (entity, other) if entity_is_owner else (other, entity)
    ownership = context.make("Ownership")
    ownership.id = context.make_id(entity.id, "owns", other.id)
    ownership.add("owner", owner)
    ownership.add("asset", asset)
    context.emit(ownership)


def parse_row(context: Context, row):
    """Turn one source row into an entity plus its sanction and emit both.

    `row` is consumed destructively: every handled field is popped so that
    the final `h.audit_data` call only sees fields that were not handled.

    The function returns early (emitting neither the entity nor the sanction)
    when the group type, the alias type or the alias quality is not known.
    Note that a ship owner, if present, has already been emitted by the time
    the alias checks run.
    """
    group_type = row.pop("GroupTypeDescription")
    schema = TYPES.get(group_type)
    if schema is None:
        context.log.error("Unknown group type", group_type=group_type)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(row.pop("GroupID"))

    # Sanction record: programme, authority and the relevant dates.
    sanction = h.make_sanction(context, entity)
    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    listed_date = h.parse_date(row.pop("DateListed"), FORMATS)
    sanction.add("listingDate", listed_date)
    designated_date = h.parse_date(row.pop("DateDesignated"), FORMATS)
    sanction.add("startDate", designated_date)

    # Prefer the listing date for createdAt; fall back to the designation
    # date only if the listing date did not produce a value.
    entity.add("createdAt", listed_date)
    if not entity.has("createdAt"):
        entity.add("createdAt", designated_date)

    sanction.add("authorityId", row.pop("UKSanctionsListRef", None))
    sanction.add("unscId", row.pop("UNRef", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    sanction.add("modifiedAt", last_updated)
    entity.add("modifiedAt", last_updated)

    # TODO: derive topics and schema from this??
    entity_type = row.pop("Entity_Type", None)
    entity.add_cast("LegalEntity", "legalForm", entity_type)

    reg_number = row.pop("Entity_BusinessRegNumber", None)
    entity.add_cast("LegalEntity", "registrationNumber", reg_number)

    # Vessel-specific fields. Ship_Length is consumed but not mapped.
    row.pop("Ship_Length", None)
    entity.add_cast("Vessel", "flag", row.pop("Ship_Flag", None))
    flags = split_new(row.pop("Ship_PreviousFlags", None))
    entity.add_cast("Vessel", "pastFlags", flags)
    entity.add_cast("Vessel", "type", row.pop("Ship_Type", None))
    entity.add_cast("Vessel", "tonnage", row.pop("Ship_Tonnage", None))
    entity.add_cast("Vessel", "buildDate", row.pop("Ship_YearBuilt", None))
    entity.add_cast("Vessel", "imoNumber", row.pop("Ship_IMONumber", None))

    ship_owner = row.pop("Ship_CurrentOwners", None)
    if ship_owner is not None:
        _emit_named_ownership(
            context,
            entity,
            ship_owner,
            "LegalEntity",
            entity_is_owner=False,
        )

    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)

    title = split_items(row.pop("Title", None))
    entity.add("title", title, quiet=True)

    # Person-specific fields.
    pobs = split_items(row.pop("Individual_TownOfBirth", None))
    entity.add_cast("Person", "birthPlace", pobs)

    dob = h.parse_date(row.pop("Individual_DateOfBirth", None), FORMATS)
    entity.add_cast("Person", "birthDate", dob)

    cob = parse_countries(row.pop("Individual_CountryOfBirth", None))
    entity.add_cast("Person", "country", cob)

    nationalities = parse_countries(row.pop("Individual_Nationality", None))
    entity.add_cast("Person", "nationality", nationalities)

    positions = split_items(row.pop("Individual_Position", None))
    entity.add_cast("Person", "position", positions)

    entity.add_cast("Person", "gender", row.pop("Individual_Gender", None))

    # The alias type selects the name property; the alias quality decides
    # whether the name is treated as weak. Unknown values abort the row.
    name_type = row.pop("AliasType", None)
    name_prop = NAME_TYPES.get(name_type)
    if name_prop is None:
        context.log.warning("Unknown name type", type=name_type)
        return
    name_quality = row.pop("AliasQuality", None)
    is_weak = WEAK_QUALITY.get(name_quality)
    if is_weak is None:
        context.log.warning("Unknown name quality", quality=name_quality)
        return

    h.apply_name(
        entity,
        name1=row.pop("name1", None),
        name2=row.pop("name2", None),
        name3=row.pop("name3", None),
        name4=row.pop("name4", None),
        name5=row.pop("name5", None),
        tail_name=row.pop("Name6", None),
        name_prop=name_prop,
        is_weak=is_weak,
        quiet=True,
    )
    entity.add("alias", row.pop("NameNonLatinScript", None))

    full_address = join_text(
        row.pop("Address1", None),
        row.pop("Address2", None),
        row.pop("Address3", None),
        row.pop("Address4", None),
        row.pop("Address5", None),
        row.pop("Address6", None),
        sep=", ",
    )

    address = h.make_address(
        context,
        full=full_address,
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    passport_number = row.pop("Individual_PassportNumber", None)
    passport_numbers = split_items(passport_number)
    entity.add_cast("Person", "passportNumber", passport_numbers)
    # TODO: passport details are consumed here but not mapped to any
    # property yet.
    row.pop("Individual_PassportDetails", None)

    ni_number = row.pop("Individual_NINumber", None)
    ni_numbers = split_items(ni_number)
    entity.add_cast("Person", "idNumber", ni_numbers)
    # TODO: national-identifier details are consumed here but not mapped to
    # any property yet.
    row.pop("Individual_NIDetails", None)

    for phone in split_new(row.pop("PhoneNumber", None)):
        entity.add_cast("LegalEntity", "phone", phone)

    for email in split_new(row.pop("EmailAddress", None)):
        entity.add_cast("LegalEntity", "email", email)

    for website in split_new(row.pop("Website", None)):
        entity.add_cast("LegalEntity", "website", website)

    # Parent companies own this entity ...
    parents = parse_companies(context, row.pop("Entity_ParentCompany", None))
    for name in parents:
        _emit_named_ownership(
            context,
            entity,
            name,
            "Organization",
            entity_is_owner=False,
        )

    # ... while this entity owns its subsidiaries.
    for name in parse_companies(context, row.pop("Entity_Subsidiaries", None)):
        _emit_named_ownership(
            context,
            entity,
            name,
            "Company",
            entity_is_owner=True,
        )

    # "A" is the only status this parser knows; anything else is logged but
    # does not stop processing.
    grp_status = row.pop("GrpStatus", None)
    if grp_status != "A":
        context.log.warning("Unknown GrpStatus", value=grp_status)

    entity.add("notes", h.clean_note(row.pop("OtherInformation", None)))
    # Presumably reports any keys still left in `row` (other than the ignored
    # ones) - verify against the helper's implementation.
    h.audit_data(row, ignore=["NonLatinScriptLanguage", "NonLatinScriptType"])

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
示例#15
0
def crawl(context: Context):
    """Fetch the source HTML table and emit one entity + sanction per row.

    Each data row's second cell holds a free-text description in which
    individual attributes are introduced by fixed Russian labels. The labels
    are peeled off the end of the text one after another with
    `maybe_rsplit`; whatever text remains at the end is treated as a
    comma-separated list of names.

    Entities start out with the "Thing" schema; if the schema is still
    "Thing" after all properties were added, it is set to "LegalEntity"
    before emitting.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm this matches the encoding of the fetched page.
    with open(path, "r") as fh:
        doc = html.parse(fh)
    tables = doc.findall(".//table")
    assert len(tables) == 1
    rows = tables[0].findall(".//tr")
    # The first two rows are skipped (presumably header rows).
    for row in rows[2:]:
        cells = [
            collapse_spaces(c.text_content()) for c in row.findall("./td")
        ]
        index = cells[0]
        body = cells[1]
        decision = cells[2]
        un_id = cells[3]
        listing_date = cells[4]
        entity = context.make("Thing")
        entity.id = context.make_slug(index, un_id)
        entity.add("notes", h.clean_note(cells[5]))

        sanction = h.make_sanction(context, entity)
        sanction.add("listingDate", clean_date(listing_date))
        sanction.add("program", decision)
        sanction.add("recordId", un_id)

        # Every maybe_rsplit call below also shortens `body`, so labels whose
        # value is not used are still split off to keep them out of the names.
        body, gender = maybe_rsplit(body, "пол:")
        entity.add_cast("Person", "gender", gender)
        body, gender = maybe_rsplit(body, "Пол:")
        entity.add_cast("Person", "gender", gender)
        body, location = maybe_rsplit(body, "местонахождение:")
        entity.add_cast("LegalEntity", "country", location)
        body, _ = maybe_rsplit(body, "Присвоенный ИМО номер компании:")
        body, _ = maybe_rsplit(body, "Номер ИМО:")
        body, emails = maybe_rsplit(body, "Адрес эл. почты:")
        for email in letter_split(emails):
            entity.add_cast("LegalEntity", "email", email)
        body, _ = maybe_rsplit(body, "Номер факса:")
        body, _ = maybe_rsplit(body, "Факс:")
        body, phones = maybe_rsplit(body, "Номера телефонов:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, phones = maybe_rsplit(body, "Тел.:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, swift = maybe_rsplit(body, "СВИФТ-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, swift = maybe_rsplit(body, "СВИФТ/БИК-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)

        body, other_info = maybe_rsplit(body, "Прочая информация:")
        entity.add_cast("Thing", "notes", other_info)
        # The listing date was already taken from its own table cell above;
        # the in-text copy is only stripped from the body.
        body, _ = maybe_rsplit(body, "Дата внесения в перечень:")
        body, addresses = maybe_rsplit(body, "Адрес:")
        for address in letter_split(addresses):
            # The country is the text after the last ", " (or the whole
            # address if there is none). rsplit returns a list, so the last
            # element must be selected before handing it to the cleaner.
            country = address.rsplit(", ", 1)[-1]
            code = registry.country.clean(country, fuzzy=True)
            obj = h.make_address(context, full=address, country_code=code)
            h.apply_address(context, entity, obj)
            entity.add("country", code)
        body, national_ids = maybe_rsplit(
            body, "Национальный идентификационный номер:")
        for national_id in letter_split(national_ids):
            entity.add_cast("LegalEntity", "idNumber", national_id)
        body, passport_nos = maybe_rsplit(body, "Паспорт №:")
        for passport_no in letter_split(passport_nos):
            entity.add_cast("Person", "passportNumber", passport_no)
        body, citizenship = maybe_rsplit(body, "Гражданство:")
        entity.add_cast("Person", "nationality", citizenship)
        weak_marker = (
            "На основании менее достоверных источников также известен как:"
        )
        body, weak_aka = maybe_rsplit(body, weak_marker)
        entity.add("alias", letter_split(weak_aka))
        strong_marker = (
            "На основании достоверных источников также известен как:"
        )
        body, strong_aka = maybe_rsplit(body, strong_marker)
        entity.add("alias", letter_split(strong_aka))
        body, _ = maybe_rsplit(body, "Р.И.К.:")

        body, birth_place = maybe_rsplit(body, "Место рождения:")
        entity.add_cast("Person", "birthPlace", birth_place)
        body, birth_dates = maybe_rsplit(body, "Дата рождения:")
        for birth_date in letter_split(birth_dates):
            entity.add_cast("Person", "birthDate", clean_date(birth_date))
        body, position = maybe_rsplit(body, "Должность:")
        entity.add_cast("Person", "position", position)
        body, job = maybe_rsplit(body, "Обращение:")
        entity.add_cast("Person", "position", job)

        body, aliases = maybe_rsplit(body, "Другие названия:")
        entity.add("alias", letter_split(aliases))

        body, aliases = maybe_rsplit(body, "Вымышленные названия:")
        entity.add("alias", letter_split(aliases))

        # Whatever is left after stripping all labelled parts is the name(s).
        names = body.split(", ")
        entity.add("name", names)

        # No property forced a more specific schema: default to LegalEntity.
        if entity.schema.name == "Thing":
            entity.schema = model.get("LegalEntity")

        context.emit(entity, target=True)
        context.emit(sanction)