예제 #1
0
def crawl(context: Context):
    """Parse the tables of the source page and emit an entity and sanction per row.

    The page is fetched, exported as a resource and parsed. Every table below
    ``div.editor-content`` is then walked row by row:

    * the first row supplies the column names (slugified with ``_``),
    * a row with exactly one cell selects, via ``TYPES``, the schema used for
      the rows that follow it,
    * any other row is turned into one entity and one sanction.
    """
    source_path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]

            # The first row of each table holds the column names.
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue

            # A single-cell row names the schema for the rows after it.
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue

            record = dict(zip(headers, cells))

            entity = context.make(schema)
            full_name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(full_name)
            # Text before the first "(" is the name; each later segment, cut
            # at its ")", is an alias.
            primary, *alias_chunks = full_name.split("(")
            entity.add("name", primary)
            for chunk in alias_chunks:
                entity.add("alias", chunk.split(")")[0])
            reason = record.pop("uzasadnienie_wpisu_na_liste")
            entity.add("notes", reason)

            # Peel labelled values off the end of the details text, one
            # CHOPSKA marker at a time; whatever is left goes to the lookup.
            remainder = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for marker, prop in CHOPSKA:
                pieces = remainder.rsplit(marker, 1)
                remainder = pieces[0]
                if len(pieces) == 1:
                    continue
                tail = pieces[1]
                if prop == "address":
                    address = h.make_address(context, full=tail)
                    h.apply_address(context, entity, address)
                else:
                    entity.add(prop, tail)

            if remainder.strip():
                found = context.lookup("details", remainder)
                if found is None:
                    context.log.warning("Unhandled details", details=remainder)
                else:
                    for key, value in found.props.items():
                        entity.add(key, value)

            sanction = h.make_sanction(context, entity)
            measures = record.pop("zastosowane_srodki_sankcyjne")
            sanction.add("provisions", measures)

            # Drop the " r." suffix before parsing the day.month.year date.
            listed_on = record.pop("data_umieszczenia_na_liscie").replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed_on, ["%d.%m.%Y"]))

            # Columns that were not consumed above are handed to the audit helper.
            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
예제 #2
0
def crawl(context: Context):
    """Emit a Person and a sanction for every listed row of the source page.

    Only ``td.tailSTxt`` cells whose text starts with ``"2."`` are processed.
    Within them, each table row provides an entry number (``td.sProvP1No``)
    and an entry text (``td.sProvP1``). The part of the text before the first
    ``"("`` holds the name(s), split on ``"s/o"`` and ``"@"``; every term
    matched by ``IN_BRACKETS`` is mapped onto entity properties.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    # NOTE(review): opened with the platform default encoding; confirm the
    # page's real encoding before pinning one here.
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            # partition() keeps the whole text as the name when an entry has
            # no bracketed details; split("(", 1) raised ValueError there.
            name, _, _ = text.partition("(")
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                # An explicit lookup entry wins over the generic patterns.
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                elif match.endswith("citizen"):
                    # Strip the leftover whitespace, as the other branches do.
                    nationality = match.replace("citizen", "").strip()
                    entity.add("nationality", nationality)
                elif match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                elif match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                else:
                    # ``warning`` rather than the deprecated ``warn`` alias.
                    context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
예제 #3
0
def parse_entry(context: Context, target, programs, places, updated_at):
    """Emit the entity, sanction and relations described by one target element.

    The subject node is the first of ``./entity`` (LegalEntity),
    ``./individual`` (Person) or ``./object`` (Vessel) found under ``target``.

    Args:
        context: Crawler context used to build, look up, log and emit.
        target: Element exposing ``find``/``findall``/``get``.
        programs: Mapping queried with the target's ``sanctions-set-id``
            attribute; the result is stored as the sanction program.
        places: Passed through unchanged to ``parse_identity``.
        updated_at: Not used by this function.

    Returns:
        None. A target that has none of the three subject nodes is logged
        and skipped.
    """
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        if node is None:
            # Without this guard the attribute access below raised
            # AttributeError on None and aborted the whole run.
            context.log.warning(
                "Target has no entity, individual or object node",
                target=target,
            )
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            context.log.warning("Unknown target type",
                                target=target,
                                object_type=object_type)
        entity = context.make("Vessel")

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)
    for other in node.findall("./other-information"):
        # An empty element has ``text`` set to None.
        value = (other.text or "").strip()
        if not value:
            continue
        if entity.schema.is_a("Vessel") and value.lower().startswith("imo"):
            # Only treat the value as an IMO number when it really has the
            # "label: number" shape; otherwise keep it as a note instead of
            # failing on the tuple unpacking.
            _, colon, imo_num = value.partition(":")
            if colon:
                entity.add("imoNumber", imo_num.strip())
                continue
        entity.add("notes", h.clean_note(value))

    sanction = h.make_sanction(context, entity)
    dates = set()
    for mod in target.findall("./modification"):
        published = mod.get("publication-date")
        if published is not None:
            dates.add(published)
        sanction.add("listingDate", published)
        sanction.add("startDate", mod.get("effective-date"))
    if dates:
        # Earliest and latest publication dates bracket the entity's history.
        entity.add("createdAt", min(dates))
        entity.add("modifiedAt", max(dates))

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for justification in node.findall("./justification"):
        # TODO: should this go into sanction:reason?
        entity.add("notes", h.clean_note(justification.text))

    for relation in node.findall("./relation"):
        rel_type = relation.get("relation-type")
        target_id = context.make_slug(relation.get("target-id"))
        res = context.lookup("relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown relationship type",
                type=rel_type,
                source=entity,
                target=target_id,
            )
            continue

        rel = context.make(res.schema)
        rel.id = context.make_slug(relation.get("ssid"))
        rel.add(res.source, entity.id)
        rel.add(res.target, target_id)
        rel.add(res.text, rel_type)

        # The other end of the relation is referenced by ID only; no entity
        # is emitted for it here.
        entity.add_schema(rel.schema.get(res.source).range)
        context.emit(rel)

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
예제 #4
0
def crawl_person(context: Context, data: Dict[str, Any]):
    """Emit a Person for one source record, plus its person-to-person links.

    Keys are popped from ``data`` (and from the nested relation dicts) as
    they are consumed, so the input is mutated in place.

    Args:
        context: Crawler context used to build, look up, log and emit.
        data: One person record. ``id`` is required, as are
            ``relationship_type`` in each ``related_countries`` item and
            ``person_id`` in each ``related_persons`` item; every other key
            read here is optional.
    """
    is_pep = data.pop("is_pep", False)
    entity = context.make("Person", target=is_pep)
    wikidata_id = clean_wdid(data.pop("wikidata_id", None))
    entity.id = person_id(context, data.pop("id"), wikidata_id)
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("modifiedAt", data.pop("last_change", None))
    entity.add("wikidataId", wikidata_id)
    entity.add("name", data.pop("full_name_en", None))
    entity.add("name", data.pop("full_name_ru", None))
    entity.add("alias", data.pop("inversed_full_name_en", None))
    entity.add("alias", data.pop("inversed_full_name_ru", None))
    entity.add("alias", data.pop("also_known_as_en", None))
    entity.add("alias", data.pop("also_known_as_ru", None))
    entity.add("alias", split_names(data.pop("names", [])))
    entity.add("birthDate", parse_date(data.pop("date_of_birth", None)))
    entity.add("deathDate", parse_date(data.pop("termination_date_human", None)))
    entity.add("birthPlace", data.pop("city_of_birth_ru", None))
    entity.add("birthPlace", data.pop("city_of_birth_en", None))
    entity.add("innCode", data.pop("inn", None))
    entity.add("firstName", data.pop("first_name_en", None))
    entity.add("firstName", data.pop("first_name_ru", None))
    entity.add("fatherName", data.pop("patronymic_en", None))
    entity.add("fatherName", data.pop("patronymic_ru", None))
    entity.add("lastName", data.pop("last_name_en", None))
    entity.add("lastName", data.pop("last_name_ru", None))

    # One position per language variant: "<workplace> (<job title>)", or
    # just the workplace when no title is given.
    for suffix in ("", "_en", "_ru"):
        role = data.pop(f"last_job_title{suffix}", None)
        org = data.pop(f"last_workplace{suffix}", None)
        if org is None or not org.strip():
            continue
        position = org
        if role is not None and role.strip():
            position = f"{org} ({role})"
        entity.add("position", position)

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        # Prefer the English country name, fall back to the Russian one.
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru", None)
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)

    for rel_data in data.pop("related_persons", []):
        other_pep = rel_data.pop("is_pep", False)
        # Optional here, as it is on the top-level record.
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id", None))
        other = context.make("Person", target=other_pep)
        other.id = person_id(context, rel_data.pop("person_id"), other_wdid)
        other.add("name", rel_data.pop("person_en", None))
        other.add("name", rel_data.pop("person_ru", None))
        other.add("wikidataId", other_wdid)

        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.warning(
                "Unknown person/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other,
            )
            continue

        # Sort the two IDs so the link gets the same ID from either side.
        id_a, id_b = sorted((entity.id, other.id))
        rel = context.make(res.schema)
        id_a_short = short_id(context, id_a)
        id_b_short = short_id(context, id_b)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, id_a)
        rel.add(res.to_prop, id_b)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed", None)))
        rel.add("startDate", parse_date(rel_data.pop("date_established", None)))
        rel.add("endDate", parse_date(rel_data.pop("date_finished", None)))

        context.emit(other, target=other_pep)
        context.emit(rel)

    data.pop("type_of_official_ru", None)
    person_type = data.pop("type_of_official_en", None)
    person_topic = context.lookup_value("person_type", person_type)
    if person_topic is None:
        context.log.warning("Unknown type of official", type=person_type)
    entity.add("topics", person_topic)
    if is_pep:
        entity.add("topics", "role.pep")
    entity.add("status", person_type)

    # Remaining keys are consumed without being used.
    # TODO: store images ("photo").
    for unused in (
        "died",
        "tags",
        "reason_of_termination_en",
        "reason_of_termination_ru",
        "photo",
        "related_companies",
        "declarations",
    ):
        data.pop(unused, None)
    context.emit(entity, target=is_pep)
예제 #5
0
def crawl_company(context: Context, data: Dict[str, Any]):
    """Emit an Organization for one source record, plus its links to persons.

    Keys are popped from ``data`` (and from the nested relation dicts) as
    they are consumed, so the input is mutated in place. Whatever is left at
    the end is handed to ``h.audit_data`` together with an ignore list.

    Args:
        context: Crawler context used to build, look up, log and emit.
        data: One company record. ``id`` is required, as are
            ``relationship_type`` in each ``related_countries`` item and
            ``person_id`` in each ``related_persons`` item.
    """
    entity = context.make("Organization")
    entity.id = company_id(context, data.pop("id"))
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)
    entity.add("name", data.pop("name_en", None))
    entity.add("name", data.pop("name_ru", None))
    entity.add("name", data.pop("name_suggest_output_ru", None))
    entity.add("alias", data.pop("also_known_as", None))
    entity.add("alias", data.pop("short_name_en", None))
    entity.add("alias", data.pop("short_name_ru", None))
    entity.add("incorporationDate", parse_date(data.pop("founded", None)))
    entity.add("dissolutionDate", parse_date(data.pop("closed", None)))
    # Both status keys are always consumed; the Russian value is only used
    # when the English key is absent.
    status_ru = data.pop("status_ru", None)
    entity.add("status", data.pop("status_en", status_ru))
    entity.add("status", data.pop("status", None))
    entity.add_cast("Company", "ogrnCode", data.pop("ogrn_code", None))
    entity.add("registrationNumber", data.pop("edrpou", None))

    for country_data in data.pop("related_countries", []):
        rel_type = country_data.pop("relationship_type")
        # Prefer the English country name, fall back to the Russian one.
        country_name = country_data.pop("to_country_en", None)
        country_name = country_name or country_data.pop("to_country_ru", None)
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warning(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
            continue
        if res.prop is not None:
            entity.add(res.prop, country_name)

    for rel_data in data.pop("related_persons", []):
        other_wdid = clean_wdid(rel_data.pop("person_wikidata_id", None))
        other_id = person_id(context, rel_data.pop("person_id"), other_wdid)

        rel_type = rel_data.pop("relationship_type_en", None)
        rel_type_ru = rel_data.pop("relationship_type_ru", None)
        rel_type = rel_type or rel_type_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.info(
                "Unknown company/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other_id,
            )
            continue

        # A lookup entry without a schema means: known type, emit nothing.
        if res.schema is None:
            continue

        if res.schema == "Organization" and res.from_prop == "asset":
            entity.schema = model.get("Company")

        rel = context.make(res.schema)
        id_a_short = short_id(context, entity.id)
        id_b_short = short_id(context, other_id)
        rel.id = context.make_slug(id_a_short, res.schema, id_b_short)
        rel.add(res.from_prop, entity.id)
        rel.add(res.to_prop, other_id)
        rel.add(res.desc_prop, rel_type)
        rel.add("modifiedAt", parse_date(rel_data.pop("date_confirmed", None)))
        rel.add("startDate", parse_date(rel_data.pop("date_established", None)))
        rel.add("endDate", parse_date(rel_data.pop("date_finished", None)))
        context.emit(rel)

    # TODO: company-to-company relations are not processed yet. The key is
    # still consumed so that it is not left in ``data`` for the audit below.
    data.pop("related_companies", None)

    address = h.make_address(
        context,
        street=data.pop("street", None),
        city=data.pop("city", None),
    )
    h.apply_address(context, entity, address)

    if data.pop("state_company", False):
        entity.add("topics", "gov.soe")

    ignore = [
        "wiki",
        "bank_name",
        "other_founders",
        "other_owners",
        "other_managers",
        "other_recipient",
    ]
    h.audit_data(data, ignore=ignore)
    context.emit(entity)
예제 #6
0
def apply_prop(context: Context, entity, sanction, field, value):
    """Copy one typed source value onto the entity or its sanction.

    ``field`` selects the handling; ``value`` is a dict from which the keys
    belonging to that field are popped, so it is mutated. A field that is not
    recognised is logged as a warning and nothing is added.
    """
    # field -> (entity property, key popped from ``value``)
    entity_props = {
        "ALIAS": ("alias", "Alias"),
        "SEXE": ("gender", "Sexe"),
        "PRENOM": ("firstName", "Prenom"),
        "NATIONALITE": ("nationality", "Pays"),
        "TITRE": ("position", "Titre"),
        "SITE_INTERNET": ("website", "SiteInternet"),
        "TELEPHONE": ("phone", "Telephone"),
        "COURRIEL": ("email", "Courriel"),
        "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
        "PASSEPORT": ("passportNumber", "NumeroPasseport"),
        "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
    }
    # field -> (sanction property, key popped from ``value``)
    # TODO: derive target countries from FONDEMENT_JURIDIQUE?
    sanction_props = {
        "REFERENCE_UE": ("authorityId", "ReferenceUe"),
        "REFERENCE_ONU": ("unscId", "ReferenceOnu"),
        "FONDEMENT_JURIDIQUE": ("program", "FondementJuridiqueLabel"),
    }

    if field in entity_props:
        prop, key = entity_props[field]
        entity.add(prop, value.pop(key))
        return

    if field in sanction_props:
        prop, key = sanction_props[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        born = parse_parts(year, month, day)
        entity.add("birthDate", born)
        return

    if field == "ADRESSE_PM" or field == "ADRESSE_PP":
        full = value.pop("Adresse")
        country = value.pop("Pays")
        address = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, address)
        return

    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return

    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        # The comment text decides which property the identifier goes to.
        result = context.lookup("identification", comment)
        if result is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        if result.prop is None:
            return
        schema = result.schema or entity.schema
        entity.add_cast(schema, result.prop, content)
        if result.prop == "notes":
            # For notes, the comment is stored alongside the content.
            entity.add(result.prop, h.clean_note(comment))
        return

    if field == "MOTIFS":
        motifs = value.pop("Motifs")
        # The reasons are recorded on both the sanction and the entity.
        sanction.add("reason", motifs)
        entity.add("notes", motifs)
        return

    context.log.warning("Unknown field", field=field, value=value)