def parse_reference(context, reference, rows):
    """Emit one entity and its sanction from all rows sharing a reference.

    Every row contributes a name or alias, an address and further details
    to the same entity. The entity starts out as a ``LegalEntity`` and is
    switched to the ``Person`` schema as soon as a row has the type
    ``Individual``.

    Args:
        context: crawler context used to create and emit entities.
        reference: value the entity ID is derived from.
        rows: iterable of dict-like rows; every handled key is popped.
    """
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    sanction = h.make_sanction(context, entity)

    for record in rows:
        if record.pop("type") == "Individual":
            entity.schema = model.get("Person")

        # Rows flagged "aka" carry an alias, all others a primary name.
        label = record.pop("name_of_individual_or_entity", None)
        is_alias = record.pop("name_type") == "aka"
        entity.add("alias" if is_alias else "name", label)

        full_address = record.pop("address")
        h.apply_address(
            context,
            entity,
            h.make_address(context, full=full_address),
        )
        sanction.add("program", record.pop("committees"))

        nationalities = multi_split(
            record.pop("citizenship"),
            ["a)", "b)", "c)", "d)"],
        )
        entity.add("nationality", nationalities, quiet=True)
        birth_dates = clean_date(record.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", record.pop("place_of_birth"), quiet=True)
        entity.add("notes", record.pop("additional_information"))
        entity.add("notes", record.pop("listing_information"), quiet=True)

        # The control date is recorded on both objects and in the context.
        updated = record.pop("control_date")
        for target in (sanction, entity):
            target.add("modifiedAt", updated)
        entity.context["updated_at"] = updated.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 2
0
def crawl(context: Context):
    """Parse every table of the source HTML page and emit its rows.

    Each table must have "Vendor name" and "From" among its header cells.
    Every body row whose first cell is non-empty becomes a ``LegalEntity``
    with an address and a sanction.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        column_names = [
            cell.text_content() for cell in table.findall("./thead/tr/td")
        ]
        assert "Vendor name" in column_names, column_names
        assert "From" in column_names, column_names

        for tr in table.findall("./tbody/tr"):
            cells = [td.text_content() for td in tr.findall("./td")]
            # Rows without a name in the first column are skipped.
            if not cells[0]:
                continue

            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            # Reuse the country value already stored on the entity.
            country_code = entity.first("country")
            address = h.make_address(
                context,
                full=cells[2],
                country_code=country_code,
            )
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
Exemplo n.º 3
0
def crawl(context: Context):
    """Parse the "sanctioned-table" HTML table and emit one person per row.

    The first table row supplies the column names (slugified ``th`` texts);
    every following row is turned into a ``Person`` with an address and a
    sanction.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    table_rows = iter(table.findall(".//tr"))
    header_row = next(table_rows, None)
    if header_row is None:
        return
    headers = [slugify(el.text) for el in header_row.findall("./th")]

    for tr in table_rows:
        values = [collapse_spaces(el.text) for el in tr.findall("./td")]
        data = dict(zip(headers, values))
        full_name = data["ad-soyad-ata-adi"]

        entity = context.make("Person")
        entity.id = context.make_id(data["id"], full_name)
        entity.add("name", full_name)
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        h.apply_address(
            context,
            entity,
            h.make_address(context, full=data["malumat"]),
        )

        context.emit(h.make_sanction(context, entity))
        context.emit(entity, target=True)
Exemplo n.º 4
0
def parse_row(context: Context, row):
    """Emit a ``LegalEntity`` and its sanction for a single source row.

    Args:
        context: crawler context used to create and emit entities.
        row: mapping with keys such as ``Name``, ``Country``,
            ``Effective_Date`` and ``FR_Citation``; missing keys yield
            ``None`` via ``row.get``.
    """
    effective_date = row.get("Effective_Date")
    name = row.get("Name")
    country = row.get("Country")

    entity = context.make("LegalEntity")
    entity.id = context.make_slug(effective_date, name)
    entity.add("name", name)
    entity.add("notes", row.get("Action"))
    entity.add("country", country)
    entity.add("modifiedAt", row.get("Last_Update"))

    address_fields = {
        "street": row.get("Street_Address"),
        "postal_code": row.get("Postal_Code"),
        "city": row.get("City"),
        "region": row.get("State"),
        "country": country,
    }
    address = h.make_address(context, **address_fields)
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    # The citation is both the sanction key and its program.
    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    start = h.parse_date(effective_date, FORMATS)
    sanction.add("startDate", start)
    end = h.parse_date(row.get("Expiration_Date"), FORMATS)
    sanction.add("endDate", end)
    context.emit(sanction)
Exemplo n.º 5
0
def crawl(context: Context):
    """Fetch the JSON listing and emit an entity and sanction per record.

    The request sends the dataset's API key in an ``apikey`` header. Each
    item under ``response.ZPROCSUPP`` becomes a ``LegalEntity`` whose first
    cleaned name is the primary name and whose remaining names are aliases.
    """
    url = context.dataset.data.url
    auth_headers = {"apikey": context.dataset.data.api_key}
    payload = context.fetch_json(url, headers=auth_headers)
    # TODO write this out to a source.json
    for record in payload["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        raw_name = record.get("SUPP_NAME")
        entity.id = context.make_slug(record.get("SUPP_ID"))
        names = clean_name(raw_name)
        country_name = record.get("COUNTRY_NAME")

        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", country_name)
        for alias in names[1:]:
            entity.add("alias", alias)

        address = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=country_name,
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        start = h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS)
        sanction.add("startDate", start)
        end = h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS)
        sanction.add("endDate", end)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemplo n.º 6
0
def apply_prop(context, entity, sanction, field, value):
    """Copy one typed detail record onto the entity or its sanction.

    Args:
        context: crawler context (used for addresses, lookups and logging).
        entity: entity receiving identity and contact properties.
        sanction: sanction receiving program and reason properties.
        field: upper-case type tag of the record, e.g. ``"ALIAS"``.
        value: dict holding the record's data; consumed keys are popped.

    Unknown ``field`` values are ignored.
    """
    # Fields that copy exactly one key of `value` to one entity property.
    entity_fields = {
        "ALIAS": ("alias", "Alias"),
        "PRENOM": ("firstName", "Prenom"),
        "NATIONALITE": ("nationality", "Pays"),
        "TITRE": ("position", "Titre"),
        "SITE_INTERNET": ("website", "SiteInternet"),
        "TELEPHONE": ("phone", "Telephone"),
        "COURRIEL": ("email", "Courriel"),
        "NUMERO_OMI": ("imoNumber", "NumeroOMI"),
        "PASSEPORT": ("passportNumber", "NumeroPasseport"),
        "AUTRE_IDENTITE": ("idNumber", "NumeroCarte"),
    }
    # Same idea, but the target is the sanction.
    sanction_fields = {
        "REFERENCE_UE": ("program", "ReferenceUe"),
        "REFERENCE_ONU": ("program", "ReferenceOnu"),
        "FONDEMENT_JURIDIQUE": ("reason", "FondementJuridiqueLabel"),
        "MOTIFS": ("reason", "Motifs"),
    }

    if field in entity_fields:
        prop, key = entity_fields[field]
        entity.add(prop, value.pop(key))
        return
    if field in sanction_fields:
        prop, key = sanction_fields[field]
        sanction.add(prop, value.pop(key))
        return

    if field == "SEXE":
        entity.add("gender", h.clean_gender(value.pop("Sexe")))
        return
    if field == "DATE_DE_NAISSANCE":
        year = value.pop("Annee")
        month = value.pop("Mois")
        day = value.pop("Jour")
        entity.add("birthDate", parse_parts(year, month, day))
        return
    if field in ("ADRESSE_PM", "ADRESSE_PP"):
        full = value.pop("Adresse")
        country = value.pop("Pays")
        address = h.make_address(context, full=full, country=country)
        h.apply_address(context, entity, address)
        return
    if field == "LIEU_DE_NAISSANCE":
        entity.add("birthPlace", value.pop("Lieu"))
        entity.add("country", value.pop("Pays"))
        return
    if field == "IDENTIFICATION":
        comment = value.pop("Commentaire")
        content = value.pop("Identification")
        # The comment text decides which property the identifier maps to.
        result = context.lookup("identification", comment)
        if result is None:
            context.log.warning(
                "Unknown Identification type",
                comment=comment,
                content=content,
            )
            return
        schema = result.schema or entity.schema
        entity.add_cast(schema, result.prop, content)
        if result.prop == "notes":
            entity.add(result.prop, comment)
Exemplo n.º 7
0
def crawl(context: Context):
    """Parse the tables inside the "editor-content" div and emit entities.

    Within each table the first row provides the column names, a row with
    a single cell selects the schema (via ``TYPES``) for the rows that
    follow, and every other row describes one sanctioned entity.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [
                collapse_spaces(td.text_content()) for td in tr.findall("./td")
            ]
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue
            if len(cells) == 1:
                # Section heading: switches the schema for subsequent rows.
                schema = TYPES[cells[0]]
                continue
            record = dict(zip(headers, cells))

            entity = context.make(schema)
            name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # Parenthesised parts of the name are treated as aliases.
            primary, *bracketed = name.split("(")
            entity.add("name", primary)
            for chunk in bracketed:
                entity.add("alias", chunk.split(")")[0])
            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            # Peel known markers off the end of the details text, one by one.
            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for marker, prop in CHOPSKA:
                head, found, tail = details.rpartition(marker)
                if not found:
                    continue
                details = head
                if prop == "address":
                    addr = h.make_address(context, full=tail)
                    h.apply_address(context, entity, addr)
                else:
                    entity.add(prop, tail)

            if details.strip():
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            sanction.add("provisions", record.pop("zastosowane_srodki_sankcyjne"))

            listed_on = record.pop("data_umieszczenia_na_liscie")
            listed_on = listed_on.replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed_on, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
Exemplo n.º 8
0
def parse_reference(context: Context, reference: int, rows):
    """Emit one entity and its sanction from all rows sharing a reference.

    The rows are walked three times: first to determine the single schema
    all rows agree on, then to collect names, and finally to add addresses,
    dates and sanction details.

    Args:
        context: crawler context used for lookups, logging and emitting.
        reference: reference number the entity ID is derived from.
        rows: re-iterable collection of dict-like rows; handled keys are
            popped from each row.

    Returns:
        None. Nothing is emitted when a row has an unknown entity type or
        an unknown name type (a warning is logged instead).

    Raises:
        AssertionError: if the rows do not resolve to exactly one schema.
    """
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Remember the last name mapped to the "name" property for the ID.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            # One cell may hold several addresses separated by SPLITS markers.
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        # The listing information is either a datetime or free text.
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
        # TODO: consider parsing if it's not a datetime?

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 9
0
def parse_address(context: Context, data):
    """Build an address from upper-case keys popped off ``data``.

    Keys absent from ``data`` are passed on as ``None``.
    """
    # make_address keyword -> key in the source record
    key_map = {
        "remarks": "NOTE",
        "street": "STREET",
        "city": "CITY",
        "region": "STATE_PROVINCE",
        "postal_code": "ZIP_CODE",
        "country": "COUNTRY",
    }
    fields = {param: data.pop(key, None) for param, key in key_map.items()}
    return h.make_address(context, **fields)
Exemplo n.º 10
0
def parse_address(context: Context, node):
    """Build an address from the text of the child elements of ``node``."""
    remarks = node.findtext("./NOTE")
    street = node.findtext("./STREET")
    city = node.findtext("./CITY")
    region = node.findtext("./STATE_PROVINCE")
    postal_code = node.findtext("./ZIP_CODE")
    country = node.findtext("./COUNTRY")
    return h.make_address(
        context,
        remarks=remarks,
        street=street,
        city=city,
        region=region,
        postal_code=postal_code,
        country=country,
    )
Exemplo n.º 11
0
def crawl(context: Context):
    """Parse the "viewcontainer" HTML table and emit an entity per row.

    The first table row provides the column names (slugified ``th`` texts).
    Each later row becomes a ``LegalEntity`` with a sanction and an
    address. A registration number embedded in the name is split off using
    the markers in ``REG_NRS``.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    table_rows = table.findall(".//tr")
    if not table_rows:
        return
    headers = [
        slugify(th.text_content(), "_") for th in table_rows[0].findall("./th")
    ]

    for tr in table_rows[1:]:
        values = [
            collapse_spaces(td.text_content()) for td in tr.findall("./td")
        ]
        cells = dict(zip(headers, values))
        # Drop the column whose header slugified to None, if there is one.
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for marker in REG_NRS:
            if marker in name:
                name, _, registration_number = name.partition(marker)
                registration_number = registration_number.replace(")", "")

        country = (
            cells.pop("nationality")
            .replace("Non ADB Member Country", "")
            .replace("Rep. of", "")
        )

        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))
        # The cell holds "<start> | <end>" when both dates are present.
        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            start = h.parse_date(start_date.strip(), FORMATS)
            sanction.add("startDate", start)
            end = h.parse_date(end_date.strip(), FORMATS)
            sanction.add("endDate", end)

        full_address = cells.pop("address")
        address = h.make_address(context, full=full_address, country=country)
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemplo n.º 12
0
def handle_address(context, entity, text):
    """Attach a free-text address to ``entity``; does nothing for ``None``.

    The part of ``text`` before the first comma (or all of it when there is
    no comma) is tried as a country name. A recognised code is added to the
    entity and passed on as the address's ``country_code``.
    """
    if text is None:
        return
    candidate = text.split(",", 1)[0]
    code = registry.country.clean(candidate, fuzzy=True)
    if code is not None:
        entity.add("country", code)
    h.apply_address(
        context,
        entity,
        h.make_address(context, full=text, country_code=code),
    )
Exemplo n.º 13
0
def crawl(context: Context):
    """Load the JSON source and emit a bank, a target and a sanction per record.

    Each record yields the bank (a ``Company``, emitted only when it has an
    ID) and the sanctioned party, which is a ``Company`` when
    ``CompanyName`` is set and a ``Person`` otherwise.
    """
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        records = json.load(fh)

    for record in records:
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if not company_name:
            entity = context.make("Person")
            entity.id = context.make_id(
                charter_no, bank_name, first_name, last_name
            )
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        else:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        city = record.pop("CityName")
        state = record.pop("StateName")
        addr = h.make_address(context, city=city, state=state, country_code="us")
        # The abbreviation is consumed but not used.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        optional_fields = (
            ("startDate", "CompleteDate"),
            ("endDate", "TerminationDate"),
            ("program", "EnforcementTypeDescription"),
            ("authorityId", "DocketNumber"),
        )
        for prop, key in optional_fields:
            sanction.add(prop, record.pop(key, None))

        context.emit(entity, target=True)
        context.emit(sanction)
Exemplo n.º 14
0
def parse_address(context: Context, el):
    """Build an address from the attributes of ``el``.

    A ``countryDescription`` of ``"UNKNOWN"`` is treated as no country; the
    country code comes from ``parse_country(el)``.
    """
    description = el.get("countryDescription")
    country = None if description == "UNKNOWN" else description
    fields = {
        "street": el.get("street"),
        "po_box": el.get("poBox"),
        "city": el.get("city"),
        "place": el.get("place"),
        "postal_code": el.get("zipCode"),
        "region": el.get("region"),
        "country": country,
        "country_code": parse_country(el),
    }
    return h.make_address(context, **fields)
Exemplo n.º 15
0
def load_locations(context: Context, doc):
    """Emit an address for each ``Location`` element and index it by ID.

    Args:
        context: crawler context used for logging and emitting.
        doc: XML element containing ``./Locations/Location`` children.

    Returns:
        dict mapping a location's ``ID`` attribute to its address entity.
        Locations whose address has no ID are left out.
    """
    locations = {}
    for location in doc.findall("./Locations/Location"):
        location_id = location.get("ID")

        # Country names come from area codes first, then country references.
        countries = {
            ref_get("AreaCode", area.get("AreaCodeID")).get("Description")
            for area in location.findall("./LocationAreaCode")
        }
        countries.update(
            ref_get("Country", node.get("CountryID")).get("Value")
            for node in location.findall("./LocationCountry")
        )
        if len(countries) > 1:
            context.log.warn("Multiple countries", countries=countries)

        # Part type name -> part value.
        parts = {}
        for part in location.findall("./LocationPart"):
            part_type = ref_value("LocPartType", part.get("LocPartTypeID"))
            parts[part_type] = part.findtext("./LocationPartValue/Value")

        country = first(countries)
        unknown = parts.get("Unknown")
        # An "Unknown" part that is recognised as a country overrides it.
        if registry.country.clean(unknown, fuzzy=True):
            country = unknown
        if country == "undetermined":
            country = None
            unknown = None

        address = h.make_address(
            context,
            full=unknown,
            street=parts.get("ADDRESS1"),
            street2=parts.get("ADDRESS2"),
            street3=parts.get("ADDRESS3"),
            city=parts.get("CITY"),
            postal_code=parts.get("POSTAL CODE"),
            region=parts.get("REGION"),
            state=parts.get("STATE/PROVINCE"),
            country=country,
        )
        if address.id is None:
            continue
        context.emit(address)
        locations[location_id] = address
    return locations
Exemplo n.º 16
0
def parse_russian_persons(context, entity, text):
    """Split trailing comma-separated details off ``text`` and apply them.

    Working from the right, every fragment after a comma is treated as a
    birth date (format ``%d.%m.%Y г.р.``), as an alias (when it starts with
    an opening parenthesis), or otherwise as an address with country code
    ``ru``. Whatever is left once no comma remains is handed to
    ``parse_name``.
    """
    while "," in text:
        text, _, tail = text.rpartition(",")
        fragment = tail.strip()
        if not fragment:
            continue

        date = parse_format(fragment, "%d.%m.%Y г.р.")
        if date.text is not None:
            entity.add("birthDate", date)
        elif fragment.startswith("("):
            entity.add("alias", fragment.replace(")", ""))
        else:
            address = h.make_address(context, full=fragment, country_code="ru")
            h.apply_address(context, entity, address)
    parse_name(entity, text)
Exemplo n.º 17
0
def compose_address(context: Context, entity, place, el):
    """Merge ``place`` with the address parsed from ``el`` and build an address.

    Values from ``parse_address(el)`` override those in ``place``. The
    merged country is also added to ``entity``. Returns the result of
    ``h.make_address``.
    """
    merged = dict(place)
    merged.update(parse_address(el))
    country = merged.get("country")
    entity.add("country", country)

    box_number = merged.get("p-o-box")
    po_box = None if box_number is None else f"P.O. Box {box_number}"

    return h.make_address(
        context,
        remarks=merged.get("remarks"),
        summary=merged.get("co"),
        street=merged.get("address-details"),
        city=merged.get("location"),
        po_box=po_box,
        postal_code=merged.get("zip-code"),
        region=merged.get("area"),
        country=country,
    )
Exemplo n.º 18
0
def crawl(context: Context):
    """Parse the first table inside ``article`` and emit a company per row.

    The first row supplies the column names (slugified ``td`` texts), with
    "from" and "to" inserted before the last column. Rows that do not
    produce a ``prohibited_practice`` column are skipped.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find(".//article//table")
    table_rows = iter(table.findall(".//tr"))
    header_row = next(table_rows, None)
    if header_row is None:
        return
    headers = [
        slugify(td.text_content(), "_") for td in header_row.findall("./td")
    ]
    headers = headers[:-2] + ["from", "to"] + headers[-1:]

    for tr in table_rows:
        values = [
            collapse_spaces(td.text_content()) for td in tr.findall("./td")
        ]
        cells = dict(zip(headers, values))
        if "prohibited_practice" not in cells:
            continue

        name = cells.pop("firm_name")
        nationality = cells.pop("nationality")
        entity = context.make("Company")
        entity.id = context.make_id(name, nationality)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", nationality)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("prohibited_practice"))
        start = h.parse_date(cells.pop("from"), FORMATS)
        sanction.add("startDate", start)
        end = h.parse_date(cells.pop("to"), FORMATS)
        sanction.add("endDate", end)

        address = h.make_address(
            context,
            full=cells.pop("address"),
            country=nationality,
        )
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Exemplo n.º 19
0
def crawl_person(context: Context, name, url):
    """Crawl one member's detail page and emit the person and related entities.

    The page's detail list is walked item by item; each item's title (its
    text, or its CSS class when it has no text) decides which property the
    item's value is written to. Political groups are emitted as
    ``Organization`` entities linked through a ``Membership``.

    Args:
        context: crawler context used for fetching, logging and emitting.
        name: display name in "Last, First" form.
        url: URL of the detail page; its last path segment becomes the ID.

    Raises:
        ValueError: if ``name`` does not contain ", ".
    """
    context.log.debug("Crawling member", name=name, url=url)
    doc = context.fetch_html(url)
    _, person_id = url.rsplit("/", 1)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("sourceUrl", url)
    person.add("name", name)
    person.add("topics", "role.pep")

    last_name, first_name = name.split(", ", 1)
    person.add("firstName", first_name)
    person.add("lastName", last_name)

    # Address parts collected from the list, keyed by title without ":".
    address_parts = {}
    details = doc.find('.//div[@class="regular-details"]')
    for row in details.findall('.//ul[@class="no-bullet"]/li'):
        children = row.getchildren()
        title = children[0]
        title_text = collapse_spaces(stringify(title.text_content()))
        # Icon-only titles have no text; fall back to their CSS class.
        title_text = title_text or title.get("class")
        value = collapse_spaces(title.tail)
        if title_text in ("Full name:", "Address:",
                          "Declaration of interests"):
            # ignore these.
            continue
        if title_text == "Emails:":
            emails = [e.text for e in row.findall(".//a")]
            person.add("email", emails)
            continue
        if "glyphicon-phone" in title_text:
            # The icon may have no tail text, in which case value is None.
            if value is not None:
                person.add("phone", value.split(","))
            continue
        if "fa-fax" in title_text:
            # TODO: yeah, no
            # person.add("phone", value)
            continue
        if title_text in ("Web sites:", "list-inline"):
            sites = [e.get("href") for e in row.findall(".//a")]
            person.add("website", sites)
            continue
        if title_text == "Represented Country:":
            person.add("country", value)
            continue
        if title_text == "Languages:":
            # TODO: missing in FtM
            # person.add("languages", value.split(','))
            continue
        if "Regions since:" in title_text:
            date = h.parse_date(value, FORMATS)
            person.add("createdAt", date)
            continue
        if "Date of birth:" in title_text:
            person.add("birthDate", h.parse_date(value, FORMATS))
            continue
        if "Commissions:" in title_text:
            for com in row.findall(".//li"):
                text = collapse_spaces(com.text_content())
                sep = "Mandate - "
                if sep in text:
                    _, text = text.split(sep, 1)
                person.add("sector", text)
            continue
        if "Areas of interest:" in title_text:
            for area in row.findall(".//li"):
                person.add("keywords", area.text_content())
            continue
        if title.tag == "i" and value is None:
            person.add("position", title_text)
            continue
        # Exact comparison: `in ("Country:")` would be a substring test
        # against the string, because the parentheses do not make a tuple.
        if title_text == "Country:":
            person.add("country", value)
        if title_text in ("Street:", "Postal code:", "City:", "Country:"):
            address_parts[title_text.replace(":", "")] = value
            continue
        if title_text == "Political group:":
            group = context.make("Organization")
            group.add("name", value)
            # Use the parenthesised short form, if any, for the group ID.
            slug = value
            if "(" in slug:
                _, slug = slug.rsplit("(", 1)
            slug = slugify(slug, sep="-")
            group.id = f"eu-cor-group-{slug}"
            context.emit(group)
            member = context.make("Membership")
            member.id = context.make_id("Membership", person.id, group.id)
            member.add("member", person)
            member.add("organization", group)
            context.emit(member)
            continue

    address = h.make_address(
        context,
        street=address_parts.get("Street"),
        city=address_parts.get("City"),
        # Must match the key stored above ("Postal code:" without the colon).
        postal_code=address_parts.get("Postal code"),
        country=address_parts.get("Country"),
    )
    h.apply_address(context, person, address)
    context.emit(person, target=True)
Exemplo n.º 20
0
def crawl_item(context: Context, listing):
    """Emit a disqualified person, their sanctions and linked companies.

    The person's details are fetched from the API URL found in
    ``listing["links"]["self"]``. Each disqualification becomes a sanction,
    and each company named in it is emitted as a ``Company`` linked to the
    person through a ``Directorship``.

    Args:
        context: crawler context used for creating and emitting entities.
        listing: dict describing one search result (``links``, ``title``,
            ``description``, ``address``, ``address_snippet``).
    """
    links = listing.get("links", {})
    url = urljoin(API_URL, links.get("self"))
    data = http_get(context, url, cache_days=14)
    person = context.make("Person")
    _, officer_id = url.rsplit("/", 1)
    person.id = context.make_slug(officer_id)

    person.add("name", listing.get("title"))
    person.add("notes", listing.get("description"))
    person.add("topics", "crime")
    source_url = urljoin(WEB_URL, links.get("self"))
    person.add("sourceUrl", source_url)

    last_name = data.pop("surname", None)
    person.add("lastName", last_name)
    forename = data.pop("forename", None)
    person.add("firstName", forename)
    other_forenames = data.pop("other_forenames", None)
    person.add("middleName", other_forenames)
    person.add("title", data.pop("title", None))

    nationality = data.pop("nationality", None)
    if nationality is not None:
        person.add("nationality", nationality.split(","))
    person.add("birthDate", data.pop("date_of_birth", None))

    person_address_data = listing.get("address", {})
    person_address = h.make_address(
        context,
        full=listing.get("address_snippet"),
        street=person_address_data.get("address_line_1"),
        street2=person_address_data.get("premises"),
        city=person_address_data.get("locality"),
        postal_code=person_address_data.get("postal_code"),
        region=person_address_data.get("region"),
        # country_code=person.first("nationality"),
    )
    h.apply_address(context, person, person_address)

    for disqual in data.pop("disqualifications", []):
        case_id = disqual.get("case_identifier")
        sanction = h.make_sanction(context, person, key=case_id)
        sanction.add("recordId", case_id)
        sanction.add("startDate", disqual.get("disqualified_from"))
        sanction.add("endDate", disqual.get("disqualified_until"))
        sanction.add("listingDate", disqual.get("undertaken_on"))
        for key, value in disqual.get("reason", {}).items():
            value = value.replace("-", " ")
            reason = f"{key}: {value}"
            sanction.add("reason", reason)
        sanction.add("country", "gb")
        context.emit(sanction)

        # NOTE(review): `full` reuses the listing's address snippet while
        # the structured parts come from the disqualification - confirm
        # that this mix is intended.
        disqual_address_data = disqual.get("address", {})
        disqual_address = h.make_address(
            context,
            full=listing.get("address_snippet"),
            street=disqual_address_data.get("address_line_1"),
            street2=disqual_address_data.get("premises"),
            city=disqual_address_data.get("locality"),
            postal_code=disqual_address_data.get("postal_code"),
            region=disqual_address_data.get("region"),
            # country_code=person.first("nationality"),
        )

        for company_name in disqual.get("company_names", []):
            company = context.make("Company")
            company.id = context.make_slug("named", company_name)
            company.add("name", company_name)
            company.add("jurisdiction", "gb")
            # Apply the address before emitting, so that the emitted
            # company already carries it.
            h.apply_address(context, company, disqual_address)
            context.emit(company)

            directorship = context.make("Directorship")
            directorship.id = context.make_id(person.id, company.id)
            directorship.add("director", person)
            directorship.add("organization", company)
            context.emit(directorship)

    context.emit(person, target=True)
Exemplo n.º 21
0
def parse_entry(context: Context, entry):
    """Turn one list entry element into an entity, its sanction and passports.

    ``entry`` is queried with ``findtext``/``findall``. An entry whose
    ``type-entry`` text is ``"2"`` becomes a ``Person``; every other entry
    stays a ``LegalEntity``. The entity is emitted as a target, followed by
    its sanction; one ``Passport`` is emitted per ``document-list`` child.
    """
    entity = context.make("LegalEntity")
    is_person = entry.findtext("./type-entry") == "2"
    if is_person:
        entity = context.make("Person")
    entity.id = context.make_slug(entry.findtext("number-entry"))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    listed_raw = entry.findtext("./date-entry")
    if listed_raw:
        # The entry date is a compact YYYYMMDD string; the same calendar date
        # is used for the entity creation and both sanction dates.
        listed_on = datetime.strptime(listed_raw, "%Y%m%d").date()
        entity.add("createdAt", listed_on)
        for prop in ("listingDate", "startDate"):
            sanction.add(prop, listed_on)

    for aka_node in entry.findall("./aka-list"):
        # Any type other than "N" is treated as an alias; quality "2" flags
        # the name as weak.
        is_alias = aka_node.findtext("type-aka") != "N"
        is_weak = aka_node.findtext("./quality-aka") == "2"
        h.apply_name(
            entity,
            name1=aka_node.findtext("./aka-name1"),
            name2=aka_node.findtext("./aka-name2"),
            name3=aka_node.findtext("./aka-name3"),
            tail_name=aka_node.findtext("./aka-name4"),
            alias=is_alias,
            is_weak=is_weak,
            quiet=True,
        )

    for title_node in entry.findall("./title-list"):
        entity.add("title", title_node.text, quiet=True)

    for doc_node in entry.findall("./document-list"):
        doc_kind = doc_node.findtext("./document-reg")
        doc_number = doc_node.findtext("./document-id")
        doc_country = doc_node.findtext("./document-country")
        passport = context.make("Passport")
        # The ID is derived from the holder plus all document fields.
        passport.id = context.make_id(
            "Passport", entity.id, doc_kind, doc_number, doc_country
        )
        passport.add("holder", entity)
        passport.add("passportNumber", doc_number)
        passport.add("summary", doc_kind)
        passport.add("country", doc_country)
        context.emit(passport)

    for id_node in entry.findall("./id-number-list"):
        entity.add("idNumber", id_node.text)

    for addr_node in entry.findall("./address-list"):
        h.apply_address(
            context,
            entity,
            h.make_address(context, full=addr_node.findtext("./address")),
        )

    # Birth details use add_cast with "Person" as the schema argument.
    for pob_node in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob_node.text)

    for dob_node in entry.findall("./date-of-birth-list"):
        entity.add_cast("Person", "birthDate", parse_date(dob_node.text))

    for nat_node in entry.findall("./nationality-list"):
        # One element may list several nationalities separated by ";" or ",".
        for piece in multi_split(nat_node.text, [";", ","]):
            entity.add("nationality", remove_bracketed(piece), quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 22
0
def parse_row(context: Context, row):
    """Convert one row of the list into an entity, a sanction and related links.

    ``row`` is consumed destructively: every recognised column is popped so
    that ``h.audit_data`` at the end can be handed whatever is left over.
    The schema comes from the ``TYPES`` lookup on ``GroupTypeDescription``.

    Returns early (emitting neither the entity nor the sanction) when the
    group type, alias type, or alias quality is not recognised.
    NOTE(review): by the time the alias checks run, a ship owner and its
    ownership link may already have been emitted -- confirm that is intended.
    """
    group_type = row.pop("GroupTypeDescription")
    schema = TYPES.get(group_type)
    if schema is None:
        context.log.error("Unknown group type", group_type=group_type)
        return
    entity = context.make(schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    sanction = h.make_sanction(context, entity)
    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    listed_date = h.parse_date(row.pop("DateListed"), FORMATS)
    sanction.add("listingDate", listed_date)
    designated_date = h.parse_date(row.pop("DateDesignated"), FORMATS)
    sanction.add("startDate", designated_date)

    # Prefer the listing date for createdAt; fall back to the designation
    # date only if adding the listing date left the property empty.
    entity.add("createdAt", listed_date)
    if not entity.has("createdAt"):
        entity.add("createdAt", designated_date)

    sanction.add("authorityId", row.pop("UKSanctionsListRef", None))
    sanction.add("unscId", row.pop("UNRef", None))
    sanction.add("status", row.pop("GroupStatus", None))
    sanction.add("reason", row.pop("UKStatementOfReasons", None))

    last_updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    sanction.add("modifiedAt", last_updated)
    entity.add("modifiedAt", last_updated)

    # TODO: derive topics and schema from this??
    entity_type = row.pop("Entity_Type", None)
    entity.add_cast("LegalEntity", "legalForm", entity_type)

    reg_number = row.pop("Entity_BusinessRegNumber", None)
    entity.add_cast("LegalEntity", "registrationNumber", reg_number)

    # Ship_Length is popped and discarded; the other ship columns are added
    # via add_cast with "Vessel" as the schema argument.
    row.pop("Ship_Length", None)
    entity.add_cast("Vessel", "flag", row.pop("Ship_Flag", None))
    flags = split_new(row.pop("Ship_PreviousFlags", None))
    entity.add_cast("Vessel", "pastFlags", flags)
    entity.add_cast("Vessel", "type", row.pop("Ship_Type", None))
    entity.add_cast("Vessel", "tonnage", row.pop("Ship_Tonnage", None))
    entity.add_cast("Vessel", "buildDate", row.pop("Ship_YearBuilt", None))
    entity.add_cast("Vessel", "imoNumber", row.pop("Ship_IMONumber", None))

    # A named ship owner becomes its own LegalEntity plus an Ownership link
    # (owner -> this entity); both are emitted immediately.
    ship_owner = row.pop("Ship_CurrentOwners", None)
    if ship_owner is not None:
        owner = context.make("LegalEntity")
        owner.id = context.make_slug("named", ship_owner)
        owner.add("name", ship_owner)
        context.emit(owner)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", owner.id)
        ownership.add("owner", owner)
        ownership.add("asset", entity)
        context.emit(ownership)

    # `countries` is reused further down as the address country.
    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)

    title = split_items(row.pop("Title", None))
    entity.add("title", title, quiet=True)

    pobs = split_items(row.pop("Individual_TownOfBirth", None))
    entity.add_cast("Person", "birthPlace", pobs)

    dob = h.parse_date(row.pop("Individual_DateOfBirth", None), FORMATS)
    entity.add_cast("Person", "birthDate", dob)

    # The country of birth is recorded under the generic "country" property.
    cob = parse_countries(row.pop("Individual_CountryOfBirth", None))
    entity.add_cast("Person", "country", cob)

    nationalities = parse_countries(row.pop("Individual_Nationality", None))
    entity.add_cast("Person", "nationality", nationalities)

    positions = split_items(row.pop("Individual_Position", None))
    entity.add_cast("Person", "position", positions)

    entity.add_cast("Person", "gender", row.pop("Individual_Gender", None))

    # Both lookups must succeed; otherwise the row is dropped with a warning
    # and nothing below (including the final emits) runs.
    name_type = row.pop("AliasType", None)
    name_prop = NAME_TYPES.get(name_type)
    if name_prop is None:
        context.log.warning("Unknown name type", type=name_type)
        return
    name_quality = row.pop("AliasQuality", None)
    is_weak = WEAK_QUALITY.get(name_quality)
    if is_weak is None:
        context.log.warning("Unknown name quality", quality=name_quality)
        return

    # Note the column casing: name1..name5 are lower-case, Name6 is not.
    h.apply_name(
        entity,
        name1=row.pop("name1", None),
        name2=row.pop("name2", None),
        name3=row.pop("name3", None),
        name4=row.pop("name4", None),
        name5=row.pop("name5", None),
        tail_name=row.pop("Name6", None),
        name_prop=name_prop,
        is_weak=is_weak,
        quiet=True,
    )
    entity.add("alias", row.pop("NameNonLatinScript", None))

    # The six address columns are joined into one comma-separated string.
    full_address = join_text(
        row.pop("Address1", None),
        row.pop("Address2", None),
        row.pop("Address3", None),
        row.pop("Address4", None),
        row.pop("Address5", None),
        row.pop("Address6", None),
        sep=", ",
    )

    address = h.make_address(
        context,
        full=full_address,
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    passport_number = row.pop("Individual_PassportNumber", None)
    passport_numbers = split_items(passport_number)
    entity.add_cast("Person", "passportNumber", passport_numbers)
    # Popped so the column does not reach the audit below; value is unused.
    passport_detail = row.pop("Individual_PassportDetails", None)
    # passport_details = split_items(passport_detail)
    # TODO: where do I stuff this?

    ni_number = row.pop("Individual_NINumber", None)
    ni_numbers = split_items(ni_number)
    entity.add_cast("Person", "idNumber", ni_numbers)
    # Popped so the column does not reach the audit below; value is unused.
    ni_detail = row.pop("Individual_NIDetails", None)
    # ni_details = split_items(ni_detail)
    # TODO: where do I stuff this?

    for phone in split_new(row.pop("PhoneNumber", None)):
        entity.add_cast("LegalEntity", "phone", phone)

    for email in split_new(row.pop("EmailAddress", None)):
        entity.add_cast("LegalEntity", "email", email)

    for website in split_new(row.pop("Website", None)):
        entity.add_cast("LegalEntity", "website", website)

    # Each parent company becomes an Organization that owns this entity.
    for name in parse_companies(context, row.pop("Entity_ParentCompany",
                                                 None)):
        parent = context.make("Organization")
        parent.id = context.make_slug("named", name)
        parent.add("name", name)
        context.emit(parent)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", parent.id)
        ownership.add("owner", parent)
        ownership.add("asset", entity)
        context.emit(ownership)

    # Each subsidiary becomes a Company owned by this entity.
    for name in parse_companies(context, row.pop("Entity_Subsidiaries", None)):
        subsidiary = context.make("Company")
        subsidiary.id = context.make_slug("named", name)
        subsidiary.add("name", name)
        context.emit(subsidiary)

        ownership = context.make("Ownership")
        ownership.id = context.make_id(entity.id, "owns", subsidiary.id)
        ownership.add("owner", entity)
        ownership.add("asset", subsidiary)
        context.emit(ownership)

    # "A" is the only status value accepted silently; anything else is
    # logged but the row is still processed.
    grp_status = row.pop("GrpStatus", None)
    if grp_status != "A":
        context.log.warning("Unknown GrpStatus", value=grp_status)

    entity.add("notes", h.clean_note(row.pop("OtherInformation", None)))
    # Hand the remaining columns to the audit helper, ignoring the two
    # non-Latin-script metadata columns.
    h.audit_data(row, ignore=["NonLatinScriptLanguage", "NonLatinScriptType"])

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 23
0
def crawl(context: Context):
    """Fetch the source HTML page and emit one entity + sanction per table row.

    The page must contain exactly one ``<table>`` (asserted). The first two
    ``<tr>`` rows are skipped (presumably headers). For each remaining row the
    free-text description in the second cell is peeled apart from the right:
    every ``maybe_rsplit`` call splits off the text following a Russian field
    label and leaves the remainder in ``body``. What is left at the end is
    treated as the comma-separated list of names.

    Entities start out as ``Thing``; if the schema is still ``Thing`` after
    all properties were added, it is set to ``LegalEntity`` before emitting.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    tables = doc.findall(".//table")
    assert len(tables) == 1
    rows = tables[0].findall(".//tr")
    for row in rows[2:]:
        cells = [
            collapse_spaces(c.text_content()) for c in row.findall("./td")
        ]
        index = cells[0]
        body = cells[1]
        decision = cells[2]
        un_id = cells[3]
        listing_date = cells[4]
        entity = context.make("Thing")
        entity.id = context.make_slug(index, un_id)
        entity.add("notes", cells[5])

        sanction = h.make_sanction(context, entity)
        sanction.add("listingDate", clean_date(listing_date))
        sanction.add("program", decision)
        sanction.add("recordId", un_id)

        # Gender label, in lower-case and capitalised spellings.
        body, gender = maybe_rsplit(body, "пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, gender = maybe_rsplit(body, "Пол:")
        entity.add_cast("Person", "gender", h.clean_gender(gender))
        body, location = maybe_rsplit(body, "местонахождение:")
        entity.add_cast("LegalEntity", "country", location)
        # IMO numbers are stripped from the body but not stored.
        body, _ = maybe_rsplit(body, "Присвоенный ИМО номер компании:")
        body, _ = maybe_rsplit(body, "Номер ИМО:")
        body, emails = maybe_rsplit(body, "Адрес эл. почты:")
        for email in letter_split(emails):
            entity.add_cast("LegalEntity", "email", email)
        # Fax numbers are stripped from the body but not stored.
        body, _ = maybe_rsplit(body, "Номер факса:")
        body, _ = maybe_rsplit(body, "Факс:")
        body, phones = maybe_rsplit(body, "Номера телефонов:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, phones = maybe_rsplit(body, "Тел.:")
        for phone in letter_split(phones):
            entity.add_cast("LegalEntity", "phone", phone)
        body, swift = maybe_rsplit(body, "СВИФТ-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)
        body, swift = maybe_rsplit(body, "СВИФТ/БИК-код:")
        entity.add_cast("LegalEntity", "swiftBic", swift)

        body, other_info = maybe_rsplit(body, "Прочая информация:")
        entity.add_cast("Thing", "notes", other_info)
        # The in-text listing date is discarded; the sanction already uses
        # the dedicated table column.
        body, _ = maybe_rsplit(body, "Дата внесения в перечень:")
        body, addresses = maybe_rsplit(body, "Адрес:")
        for address in letter_split(addresses):
            country = address
            if ", " in country:
                # Use only the segment after the last ", " as the country
                # candidate. rsplit returns a list, so the last element must
                # be selected explicitly before cleaning.
                country = address.rsplit(", ", 1)[-1]
            code = registry.country.clean(country, fuzzy=True)
            obj = h.make_address(context, full=address, country_code=code)
            h.apply_address(context, entity, obj)
            entity.add("country", code)
        body, national_ids = maybe_rsplit(
            body, "Национальный идентификационный номер:")
        for national_id in letter_split(national_ids):
            entity.add_cast("LegalEntity", "idNumber", national_id)
        body, passport_nos = maybe_rsplit(body, "Паспорт №:")
        for passport_no in letter_split(passport_nos):
            entity.add_cast("Person", "passportNumber", passport_no)
        body, citizenship = maybe_rsplit(body, "Гражданство:")
        entity.add_cast("Person", "nationality", citizenship)
        # "Also known as" labels: the less-reliable-sources variant first,
        # then the reliable-sources variant. Both are stored as plain aliases.
        aka = "На основании менее достоверных источников также известен как:"
        body, aka = maybe_rsplit(body, aka)
        entity.add("alias", letter_split(aka))
        strong_aka = "На основании достоверных источников также известен как:"
        body, strong_aka = maybe_rsplit(body, strong_aka)
        entity.add("alias", letter_split(strong_aka))
        # Stripped from the body but not stored.
        body, _ = maybe_rsplit(body, "Р.И.К.:")

        body, birth_place = maybe_rsplit(body, "Место рождения:")
        entity.add_cast("Person", "birthPlace", birth_place)
        body, birth_dates = maybe_rsplit(body, "Дата рождения:")
        for birth_date in letter_split(birth_dates):
            entity.add_cast("Person", "birthDate", clean_date(birth_date))
        body, position = maybe_rsplit(body, "Должность:")
        entity.add_cast("Person", "position", position)
        body, job = maybe_rsplit(body, "Обращение:")
        entity.add_cast("Person", "position", job)

        body, aliases = maybe_rsplit(body, "Другие названия:")
        entity.add("alias", letter_split(aliases))

        body, aliases = maybe_rsplit(body, "Вымышленные названия:")
        entity.add("alias", letter_split(aliases))

        # Whatever remains after all labelled fields were removed is the
        # comma-separated name list.
        names = body.split(", ")
        entity.add("name", names)

        # No property forced a more specific schema: default to LegalEntity.
        if entity.schema.name == "Thing":
            entity.schema = model.get("LegalEntity")

        context.emit(entity, target=True)
        context.emit(sanction)
Exemplo n.º 24
0
def emit_row(context: Context, sheet: str, section: str, row: Dict[str, List[str]]):
    """Emit one entity and its sanction for a spreadsheet row.

    Args:
        context: crawler context used to look up the schema, build and emit
            entities, and log.
        sheet: name of the sheet the row came from (used only in the warning).
        section: section label; selects the schema via
            ``context.lookup_value("schema", section)`` and is stored as the
            sanction program.
        row: mapping of column name to list of cell values. Recognised columns
            are popped from it.

    Rows whose section has no schema are skipped with a warning. Rows for
    which ``context.make_id`` yields ``None`` from the English and Japanese
    names (including rows where those columns are absent) are skipped silently.
    """
    schema = context.lookup_value("schema", section)
    if schema is None:
        context.log.warning("No schema for section",
                            section=section,
                            sheet=sheet)
        return
    entity = context.make(schema)
    # Read the name columns defensively: a missing column is treated as an
    # empty list so the None-id guard below handles it, instead of failing
    # with TypeError on unpacking None.
    names_english = row.get("name_english") or []
    names_japanese = row.get("name_japanese") or []
    entity.id = context.make_id(*names_english, *names_japanese)
    if entity.id is None:
        return
    entity.add("name", parse_names(row.pop("name_english", [])))
    # Japanese names become the primary name only when no English name was
    # added; otherwise they are recorded as aliases.
    japanese_prop = "alias" if entity.has("name") else "name"
    entity.add(japanese_prop, parse_names(row.pop("name_japanese", [])))

    entity.add("alias", parse_names(row.pop("alias", [])))
    entity.add("alias", parse_names(row.pop("known_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("weak_alias", [])))
    entity.add("weakAlias", parse_names(row.pop("nickname", [])))
    entity.add("previousName", parse_names(row.pop("past_alias", [])))
    entity.add("previousName", parse_names(row.pop("old_name", [])))
    entity.add_cast("Person", "position", row.pop("position", []))
    birth_date = parse_date(row.pop("birth_date", []))
    entity.add_cast("Person", "birthDate", birth_date)
    entity.add_cast("Person", "birthPlace", row.pop("birth_place", []))
    entity.add_cast("Person", "passportNumber", row.pop("passport_number", []))
    entity.add("idNumber", row.pop("id_number", []))
    entity.add("idNumber", row.pop("identification_number", []))
    entity.add("notes", row.pop("other_information", []))
    entity.add("notes", row.pop("details", []))
    entity.add("phone", row.pop("phone", []))
    # Fax numbers are stored under the phone property as well.
    entity.add("phone", row.pop("fax", []))

    # Both the "address" and "where" columns hold full address strings.
    for address_full in row.pop("address", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)

    for address_full in row.pop("where", []):
        address = h.make_address(context, full=address_full)
        h.apply_address(context, entity, address)

    # Titles go to the title property for persons and to notes otherwise.
    title = row.pop("title", [])
    if entity.schema.is_a("Person"):
        entity.add("title", title)
    else:
        entity.add("notes", title)
    entity.add("country", row.pop("citizenship", []))
    entity.add("country", row.pop("activity_area", []))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", section)
    sanction.add("reason", row.pop("root_nomination", None))
    sanction.add("reason", row.pop("reason_res1483", None))
    sanction.add("recordId", row.pop("notification_number", None))

    sanction.add("startDate", parse_date(row.pop("notification_date", [])))
    sanction.add("startDate", parse_date(row.pop("designated_date", [])))
    sanction.add("listingDate", parse_date(row.pop("publication_date", [])))

    # Popped and discarded.
    row.pop("designated_un", None)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 25
0
def parse_row(context, row):
    """Build and emit an entity and its sanction from one list row.

    ``row`` is consumed by popping. The schema is ``Person`` for group type
    ``"Individual"``, ``Vessel`` when ``TypeOfVessel`` is present, and for
    group type ``"Entity"`` whatever ``context.lookup_value("org_type", ...)``
    returns (called with ``"Organization"`` as its third argument). Any other
    group type is logged as an error and the row is skipped. Columns still
    left in ``row`` at the end are pretty-printed.
    """
    group_type = row.pop("GroupTypeDescription")
    org_type = row.pop("OrgType", None)
    if group_type == "Individual":
        base_schema = "Person"
    elif row.get("TypeOfVessel") is not None:
        base_schema = "Vessel"
    elif group_type == "Entity":
        base_schema = context.lookup_value(
            "org_type", org_type, "Organization"
        )
    else:
        context.log.error("Unknown entity type", group_type=group_type)
        return

    entity = context.make(base_schema)
    entity.id = context.make_slug(row.pop("GroupID"))
    if org_type is not None:
        entity.add_cast("LegalEntity", "legalForm", split_items(org_type))

    sanction = h.make_sanction(context, entity)
    for note_column in ("OtherInformation", "FurtherIdentifiyingInformation"):
        entity.add("notes", row.pop(note_column, None), quiet=True)

    sanction.add("program", row.pop("RegimeName"))
    sanction.add("authority", row.pop("ListingType", None))
    sanction.add("startDate", h.parse_date(row.pop("DateListed"), FORMATS))
    for prop, column in (
        ("recordId", "FCOId"),
        ("status", "GroupStatus"),
        ("reason", "UKStatementOfReasons"),
    ):
        sanction.add(prop, row.pop(column, None))

    updated = h.parse_date(row.pop("LastUpdated"), FORMATS)
    if updated is not None:
        # Record the update time both as a property and in the context dict,
        # first on the sanction and then on the entity.
        for record in (sanction, entity):
            record.add("modifiedAt", updated)
            record.context["updated_at"] = updated

    # DoB is sometimes a year only, so the combined column is discarded and
    # the date is rebuilt from its separate parts.
    row.pop("DateOfBirth", None)
    birth_date = parse_parts(
        row.pop("YearOfBirth", 0),
        row.pop("MonthOfBirth", 0),
        row.pop("DayOfBirth", 0),
    )
    entity.add_cast("Person", "birthDate", birth_date)

    entity.add_cast("Person", "gender", h.clean_gender(row.pop("Gender", None)))
    national_id = row.pop("NationalIdNumber", None)
    entity.add_cast("LegalEntity", "idNumber", split_items(national_id))
    passport_raw = row.pop("PassportDetails", None)
    entity.add_cast("Person", "passportNumber", split_items(passport_raw))

    # Vessel columns, in source order: (schema argument, property, column).
    for cast_schema, prop, column in (
        ("Vessel", "flag", "FlagOfVessel"),
        ("Vessel", "pastFlags", "PreviousFlags"),
        ("Vehicle", "buildDate", "YearBuilt"),
        ("Vehicle", "type", "TypeOfVessel"),
        ("Vessel", "imoNumber", "IMONumber"),
        ("Vessel", "tonnage", "TonnageOfVessel"),
    ):
        entity.add_cast(cast_schema, prop, row.pop(column, None))
    row.pop("LengthOfVessel", None)

    entity.add("title", split_items(row.pop("NameTitle", None)), quiet=True)
    # name3..name5 all land in middleName; note the lower-case column names.
    for prop, column in (
        ("firstName", "name1"),
        ("secondName", "name2"),
        ("middleName", "name3"),
        ("middleName", "name4"),
        ("middleName", "name5"),
    ):
        entity.add(prop, row.pop(column, None), quiet=True)
    tail_name = row.pop("Name6", None)
    entity.add("lastName", tail_name, quiet=True)
    # Name6 doubles as the full name when no FullName column is present.
    full_name = row.pop("FullName", tail_name)
    row.pop("AliasTypeName")
    name_prop = "alias" if row.pop("AliasType") == "AKA" else "name"
    entity.add(name_prop, full_name)

    entity.add(
        "nationality", parse_countries(row.pop("Nationality", None)), quiet=True
    )
    entity.add("position", split_items(row.pop("Position", None)), quiet=True)

    entity.add(
        "country", parse_countries(row.pop("CountryOfBirth", None)), quiet=True
    )

    countries = parse_countries(row.pop("Country", None))
    entity.add("country", countries)
    entity.add("birthPlace", split_items(row.pop("TownOfBirth", None)), quiet=True)

    address = h.make_address(
        context,
        full=row.pop("FullAddress", None),
        street=row.pop("address1", None),
        street2=row.pop("address2", None),
        street3=row.pop("address3", None),
        city=row.pop("address4", None),
        place=row.pop("address5", None),
        region=row.pop("address6", None),
        postal_code=row.pop("PostCode", None),
        country=first(countries),
    )
    h.apply_address(context, entity, address)

    entity.add_cast(
        "LegalEntity", "registrationNumber", row.pop("BusinessRegNumber", None)
    )

    phone_items = split_items(row.pop("PhoneNumber", None), comma=True)
    entity.add_cast("LegalEntity", "phone", h.clean_phones(phone_items))

    entity.add_cast(
        "LegalEntity", "website", split_items(row.pop("Website", None), comma=True)
    )

    email_items = split_items(row.pop("EmailAddress", None), comma=True)
    entity.add_cast("LegalEntity", "email", h.clean_emails(email_items))

    # Columns that are deliberately dropped. The first three are relationship
    # columns that are not turned into graph links yet (TODO: graph).
    for ignored in (
        "Subsidiaries",
        "ParentCompany",
        "CurrentOwners",
        "DateListedDay",
        "DateListedMonth",
        "DateListedYear",
        "LastUpdatedDay",
        "LastUpdatedMonth",
        "LastUpdatedYear",
        "GrpStatus",
        "ID",
        "DateOfBirthId",
    ):
        row.pop(ignored, None)
    if len(row) > 0:
        pprint(row)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
Exemplo n.º 26
0
def parse_entry(context, entry):
    """Emit an entity, its sanction and any passports for one list entry.

    ``entry`` is queried with ``findtext``/``findall``. A ``type-entry`` of
    ``"2"`` yields a ``Person``; anything else yields a ``LegalEntity``.
    Names typed ``"N"`` are stored as ``name``; other names are stored as
    ``weakAlias`` when their quality is ``"2"`` and as ``alias`` otherwise.
    """
    entity = context.make("LegalEntity")
    is_person = entry.findtext("./type-entry") == "2"
    if is_person:
        entity = context.make("Person")
    entity.id = context.make_slug(entry.findtext("number-entry"))

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    listed_raw = entry.findtext("./date-entry")
    if listed_raw:
        # Compact YYYYMMDD string: the full timestamp goes into the entity
        # context, the calendar date onto the sanction.
        listed_at = datetime.strptime(listed_raw, "%Y%m%d")
        entity.context["created_at"] = listed_at.isoformat()
        sanction.add("startDate", listed_at.date())

    # Name-part elements in order, with the property each one is stored under.
    name_parts = (
        ("./aka-name1", "firstName"),
        ("./aka-name2", "secondName"),
        ("./aka-name3", "middleName"),
        ("./aka-name4", "lastName"),
    )
    for aka_node in entry.findall("./aka-list"):
        pieces = []
        for tag, prop in name_parts:
            piece = aka_node.findtext(tag)
            entity.add(prop, piece, quiet=True)
            pieces.append(piece)
        full_name = jointext(*pieces)
        if aka_node.findtext("type-aka") == "N":
            name_prop = "name"
        elif aka_node.findtext("./quality-aka") == "2":
            name_prop = "weakAlias"
        else:
            name_prop = "alias"
        entity.add(name_prop, full_name)

    for title_node in entry.findall("./title-list"):
        entity.add("title", title_node.text, quiet=True)

    for doc_node in entry.findall("./document-list"):
        doc_kind = doc_node.findtext("./document-reg")
        doc_number = doc_node.findtext("./document-id")
        doc_country = doc_node.findtext("./document-country")
        passport = context.make("Passport")
        # The ID is derived from the holder plus all document fields.
        passport.id = context.make_id(
            "Passport", entity.id, doc_kind, doc_number, doc_country
        )
        passport.add("holder", entity)
        passport.add("passportNumber", doc_number)
        passport.add("summary", doc_kind)
        passport.add("country", doc_country)
        context.emit(passport)

    for id_node in entry.findall("./id-number-list"):
        entity.add("idNumber", id_node.text)

    for addr_node in entry.findall("./address-list"):
        h.apply_address(
            context,
            entity,
            h.make_address(context, full=addr_node.findtext("./address")),
        )

    # Birth details use add_cast with "Person" as the schema argument.
    for pob_node in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob_node.text)

    for dob_node in entry.findall("./date-of-birth-list"):
        entity.add_cast("Person", "birthDate", parse_date(dob_node.text))

    for nat_node in entry.findall("./nationality-list"):
        # One element may list several nationalities separated by ";" or ",".
        for piece in multi_split(nat_node.text, [";", ","]):
            entity.add("nationality", remove_bracketed(piece), quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
Exemplo n.º 27
0
def crawl_organizations(context: Context):
    """Crawl the organizations workbook and emit entities, sanctions and links.

    Each record from ``excel_records`` becomes an ``Organization`` with an
    attached sanction. Records for which ``context.make_id`` returns ``None``
    from the English and Hebrew names are skipped. Cross references between
    records (``linked_to_internal_seq_id``, ";"-separated) are collected
    during the first pass and emitted afterwards as ``UnknownLink`` entities,
    but only when both ends resolve to an emitted organization.
    """
    path = context.fetch_resource("organizations.xlsx", ORG_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)
    seq_ids = {}  # internal sequence id -> entity id
    links = []  # (larger seq id, smaller seq id) pairs
    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        name_en = record.pop("organization_name_english", None)
        name_he = record.pop("organization_name_hebrew", None)
        entity = context.make("Organization")
        entity.id = context.make_id(name_en, name_he)
        if entity.id is None:
            continue
        if seq_id is not None:
            seq_ids[seq_id] = entity.id
        entity.add("name", name_en)
        entity.add("name", name_he)
        entity.add("topics", "crime.terror")
        entity.add("notes", lang_pick(record, "comments"))
        entity.add("notes", record.pop("column_42", None))
        entity.add("email", record.pop("email", None))
        entity.add("country", record.pop("country_hebrew", None))
        entity.add("country", record.pop("country_english", None))
        entity.add("registrationNumber", record.pop("corporation_id", None))
        entity.add("legalForm", lang_pick(record, "corporation_type"))
        entity.add("jurisdiction", lang_pick(record, "location_of_formation"))
        date = parse_date(record.pop("date_of_corporation", None))
        entity.add("incorporationDate", date)
        # Iterate over a snapshot of the keys because matching fields are
        # popped from the record inside the loop.
        for field in list(record.keys()):
            if field.startswith("organization_name_"):
                entity.add("alias", record.pop(field, None))
            if field.startswith("telephone"):
                entity.add("phone", record.pop(field, None))
            if field.startswith("website"):
                entity.add("website", record.pop(field, None))

        entity.add("phone", record.pop("column_70", None))
        entity.add("website", record.pop("column_73", None))

        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("seq_num_in_other_countries", None))
        sanction.add("program", record.pop("designation_type", None))
        sanction.add("reason", lang_pick(record, "designation_justification"))
        sanction.add("authority", lang_pick(record, "designated_by"))
        sanction.add("publisher", record.pop("public_records_references", None))

        # Consumed but not stored.
        lang_pick(record, "designated_by_abroad")
        record.pop("date_designated_in_other_countries", None)

        # Treat a missing column and an empty/None cell alike.
        linked = record.pop("linked_to_internal_seq_id", None) or ""
        if seq_id is not None:
            # Without a sequence id the link could never be resolved later,
            # and max()/min() against None would raise TypeError.
            for link in linked.split(";"):
                if not link:
                    # "".split(";") yields [""]; skip such empty references.
                    continue
                # Store the pair in a fixed (larger, smaller) order so a link
                # declared from either side yields the same pair.
                links.append((max(link, seq_id), min(link, seq_id)))

        street = lang_pick(record, "street")
        city = lang_pick(record, "city_village")
        if street or city:
            address = h.make_address(
                context, street=street, city=city, country_code=entity.first("country")
            )
            h.apply_address(context, entity, address)

        for field in (
            "date_of_temporary_designation",
            "date_of_permenant_designation",
            "date_designation_in_west_bank",
        ):
            sanction.add("startDate", parse_date(record.pop(field, None)))

        context.emit(entity, target=True)
        context.emit(sanction)
        # Print any columns that were not consumed above.
        if len(record):
            context.pprint(record)

    for subject_key, object_key in links:
        subject_id = seq_ids.get(subject_key)
        object_id = seq_ids.get(object_key)
        if subject_id is None or object_id is None:
            continue
        link = context.make("UnknownLink")
        link.id = context.make_id(subject_id, object_id)
        link.add("subject", subject_id)
        link.add("object", object_id)
        context.emit(link)
Exemplo n.º 28
0
def parse_result(context, result):
    """Turn one result record into an entity and a sanction, and emit both.

    ``result`` is consumed destructively: every handled key is popped, and
    anything still left at the end is dumped through ``context.pprint`` so
    that unhandled fields are noticed. Records whose ``type`` has no schema
    in the ``type`` lookup are logged as errors and skipped.
    """
    result_type = result.pop("type", None)
    schema = context.lookup_value("type", result_type)
    if schema is None:
        context.log.error("Unknown result type", type=result_type)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(result.pop("id"))

    # When an entity number is present, the ID is rebuilt via SDN.make_slug
    # and replaces the slug derived from the record's own ``id``.
    entity_number = result.pop("entity_number", None)
    if entity_number is not None:
        assert int(entity_number)
        entity.id = SDN.make_slug(entity_number)

    entity.add("name", result.pop("name", None))
    for names in ensure_list(result.pop("alt_names", "")):
        # Each entry may hold several names joined by "; ".
        entity.add("alias", names.split("; "))
    entity.add("notes", result.pop("remarks", None))
    entity.add("country", result.pop("country", None))

    if entity.schema.is_a("Person"):
        entity.add("position", result.pop("title", None))
        for key in ("nationalities", "citizenships"):
            entity.add("nationality", result.pop(key, None))
        for raw_date in result.pop("dates_of_birth", []):
            entity.add("birthDate", h.parse_date(raw_date, FORMATS))
        entity.add("birthPlace", result.pop("places_of_birth", None))
    elif entity.schema.is_a("Vessel"):
        vessel_fields = (
            ("flag", "vessel_flag"),
            ("callSign", "call_sign"),
            ("type", "vessel_type"),
            ("grossRegisteredTonnage", "gross_registered_tonnage"),
            ("tonnage", "gross_tonnage"),
        )
        for prop, key in vessel_fields:
            entity.add(prop, result.pop(key, None))

        # TODO: make adjacent owner entity
        result.pop("vessel_owner", None)

    # The fields consumed in the Person branch must be absent or empty by
    # now; for persons they were already popped, for any other schema this
    # guards against silently dropping data.
    assert result.pop("title", None) is None
    person_only_lists = (
        "nationalities",
        "citizenships",
        "dates_of_birth",
        "places_of_birth",
    )
    for key in person_only_lists:
        assert not len(result.pop(key, []))

    for addr in result.pop("addresses", []):
        address = h.make_address(
            context,
            street=addr.get("address"),
            city=addr.get("city"),
            postal_code=addr.get("postal_code"),
            region=addr.get("state"),
            country=addr.get("country"),
        )
        h.apply_address(context, entity, address)

    for ident in result.pop("ids", []):
        issuer = ident.pop("country")
        entity.add("country", issuer)
        feature = ident.pop("type")
        number = ident.pop("number")
        issued = ident.pop("issue_date", None)
        expires = ident.pop("expiration_date", None)
        h.apply_feature(
            context,
            entity,
            feature,
            number,
            country=issuer,
            date_formats=FORMATS,
            start_date=issued,
            end_date=expires,
        )

    sanction = context.make("Sanction")
    sanction.id = context.make_id(entity.id, "Sanction")
    sanction.add("entity", entity)
    # (sanction property, record key, default when the key is missing)
    sanction_fields = (
        ("program", "programs", []),
        ("status", "license_policy", []),
        ("reason", "license_requirement", []),
        ("reason", "federal_register_notice", None),
        ("startDate", "start_date", None),
        ("endDate", "end_date", None),
    )
    for prop, key, default in sanction_fields:
        sanction.add(prop, result.pop(key, default))
    sanction.add("country", "us")
    sanction.add("authority", result.pop("source", None))

    # TODO: deref
    sanction.add(
        "sourceUrl",
        deref_url(context, result.pop("source_information_url")),
    )
    result.pop("source_list_url")

    # TODO: what is this?
    result.pop("standard_order", None)

    context.emit(sanction)
    context.emit(entity, target=True)

    # Anything not consumed above is printed for inspection.
    if result:
        context.pprint(result)
Exemplo n.º 29
0
def crawl_row(context: Context, data: Dict[str, str]):
    """Build and emit a sanctioned entity and its sanction from one row.

    ``data`` is consumed destructively: every handled column is popped and
    whatever remains is handed to ``h.audit_data``. The same information
    appears under several column spellings (for example ``COMMENTS`` and
    ``Comments``), so each known spelling is popped in turn.

    The entity is created as a ``LegalEntity``; person-specific values are
    added through ``entity.add_cast("Person", ...)``.

    Raises:
        AssertionError: if no usable entity ID could be derived from the row.
    """
    entity = context.make("LegalEntity")
    # Pop both spellings of the ID column. The fallback pop needs its own
    # default: it is evaluated before the primary lookup, so without one a
    # row carrying only ``INDIVIDUAL_Id`` would raise KeyError.
    fallback_id = data.pop("IndividualID", None)
    ind_id = data.pop("INDIVIDUAL_Id", fallback_id)
    entity.id = context.make_slug(ind_id)
    assert entity.id, data

    note_columns = ("COMMENTS", "Comments", "NOTE", "NOTE1", "NOTE2", "NOTE3")
    for column in note_columns:
        entity.add("notes", h.clean_note(data.pop(column, None)))

    # (property, column) pairs added verbatim; order matches the source
    # columns' historical handling and is kept stable on purpose.
    person_columns = (
        ("nationality", "NATIONALITY"),
        ("nationality", "Nationality"),
        ("title", "TITLE"),
        ("title", "Title"),
        ("position", "DESIGNATION"),
        ("position", "Designation"),
        ("birthPlace", "PLACEOFBIRTH"),
        ("birthPlace", "IndividualPlaceOfBirth"),
        ("birthPlace", "CITY_OF_BIRTH"),
        ("birthDate", "YEAR"),
        ("gender", "GENDER"),
    )
    for prop, column in person_columns:
        entity.add_cast("Person", prop, data.pop(column, None))

    # Birth dates that need parsing before they are added.
    for column in ("DATE", "DATE_OF_BIRTH", "IndividualDateOfBirth"):
        birth_date = parse_date(data.pop(column, None))
        entity.add_cast("Person", "birthDate", birth_date)

    # Birthplace city and state/province are discarded; only the country
    # and the free-text note are kept.
    data.pop("BIRTHPLACE_x0020_CITY", None)
    data.pop("BIRTHPLACE_x0020_STATE_PROVINCE", None)
    entity.add("country", data.pop("BIRTHPLACE_x0020_COUNTRY", None))
    entity.add("country", data.pop("COUNTRY_OF_BIRTH", None))
    entity.add_cast("Person", "birthPlace",
                    data.pop("BIRTHPLACE_x0020_NOTE", None))

    h.apply_name(
        entity,
        full=data.pop("FullName", None),
        given_name=data.pop("FIRST_NAME", None),
        second_name=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    # Original-script names containing "?" are skipped
    # (presumably mis-encoded text -- TODO confirm).
    alias = data.pop("NAME_ORIGINAL_SCRIPT", None)
    if alias is not None and "?" not in alias:
        entity.add("alias", alias)
    entity.add("alias", data.pop("SORT_KEY", None))
    data.pop("IndividualAlias", None)

    for column in ("PASSPORT", "IndividualDocument"):
        entity.add_cast("Person", "passportNumber", data.pop(column, None))
    data.pop("DATE_OF_ISSUE", None)
    data.pop("CITY_OF_ISSUE", None)
    entity.add("country", data.pop("COUNTRY_OF_ISSUE", None))
    entity.add_cast("Person", "idNumber", data.pop("IDNUMBER", None))

    address = h.make_address(
        context,
        # remarks=data.pop("NOTE"),
        full=data.pop("IndividualAddress", None),
        street=data.pop("STREET", None),
        city=data.pop("CITY", None),
        region=data.pop("STATE_PROVINCE", None),
        postal_code=data.pop("ZIP_CODE", None),
        country=data.pop("COUNTRY", None),
    )
    h.apply_address(context, entity, address)

    sanction = h.make_sanction(context, entity)
    inserted_at = parse_date(data.pop("DateInserted", None))
    # Both spellings of the listing column are popped; "ListedON" wins
    # whenever it is present.
    listed_fallback = data.pop("ListedOn", None)
    listed_at = parse_date(data.pop("ListedON", listed_fallback))
    # Each date falls back to the other when its own column is missing.
    entity.add("createdAt", inserted_at or listed_at)
    sanction.add("listingDate", listed_at or inserted_at)
    sanction_columns = (
        ("startDate", "FROM_YEAR"),
        ("endDate", "TO_YEAR"),
        ("program", "UN_LIST_TYPE"),
        ("unscId", "REFERENCE_NUMBER"),
        ("unscId", "ReferenceNumber"),
        ("authority", "SUBMITTED_BY"),
    )
    for prop, column in sanction_columns:
        sanction.add(prop, data.pop(column, None))

    entity.add("topics", "sanction")
    # Report any column that was not consumed above, except known noise.
    h.audit_data(data,
                 ignore=["VERSIONNUM", "TYPE_OF_DATE", "ApplicationStatus"])
    context.emit(entity, target=True)
    context.emit(sanction)
Exemplo n.º 30
0
def crawl_company(context: Context, data: Dict[str, Any]):
    """Emit an organization entity for one company record.

    Besides the entity itself, one relationship entity is emitted for every
    related person whose relationship type resolves to a schema through the
    ``person_relations`` lookup. Related countries are mapped onto entity
    properties through the ``country_links`` lookup. ``data`` is consumed
    destructively and the leftovers are checked with ``h.audit_data``.
    """
    entity = context.make("Organization")
    entity.id = company_id(context, data.pop("id"))
    entity.add("sourceUrl", data.pop("url_en", None))
    data.pop("url_ru", None)

    for key in ("name_en", "name_ru", "name_suggest_output_ru"):
        entity.add("name", data.pop(key, None))
    for key in ("also_known_as", "short_name_en", "short_name_ru"):
        entity.add("alias", data.pop(key, None))
    for prop, key in (
        ("incorporationDate", "founded"),
        ("dissolutionDate", "closed"),
    ):
        entity.add(prop, parse_date(data.pop(key, None)))

    # Both language variants are popped; the English status is used when
    # its key is present, the Russian one otherwise.
    status_ru = data.pop("status_ru", None)
    entity.add("status", data.pop("status_en", status_ru))
    entity.add("status", data.pop("status", None))
    entity.add_cast("Company", "ogrnCode", data.pop("ogrn_code", None))
    entity.add("registrationNumber", data.pop("edrpou", None))

    for country_link in data.pop("related_countries", []):
        rel_type = country_link.pop("relationship_type")
        country_name = country_link.pop("to_country_en", None)
        if not country_name:
            # The Russian name is only consulted when no English one exists.
            country_name = country_link.pop("to_country_ru")
        res = context.lookup("country_links", rel_type)
        if res is None:
            context.log.warn(
                "Unknown country link",
                rel_type=rel_type,
                entity=entity,
                country=country_name,
            )
        elif res.prop is not None:
            entity.add(res.prop, country_name)

    for person in data.pop("related_persons", []):
        wikidata_id = clean_wdid(person.pop("person_wikidata_id"))
        other_id = person_id(context, person.pop("person_id"), wikidata_id)

        label_en = person.pop("relationship_type_en", None)
        label_ru = person.pop("relationship_type_ru", None)
        rel_type = label_en or label_ru
        res = context.lookup("person_relations", rel_type)
        if res is None:
            context.log.info(
                "Unknown company/person relation type",
                rel_type=rel_type,
                entity=entity,
                other=other_id,
            )
            continue

        # A lookup hit without a schema means this relation is not emitted.
        if res.schema is None:
            continue

        if res.schema == "Organization" and res.from_prop == "asset":
            entity.schema = model.get("Company")

        rel = context.make(res.schema)
        rel.id = context.make_slug(
            short_id(context, entity.id),
            res.schema,
            short_id(context, other_id),
        )
        rel.add(res.from_prop, entity.id)
        rel.add(res.to_prop, other_id)
        rel.add(res.desc_prop, rel_type)
        for prop, key in (
            ("modifiedAt", "date_confirmed"),
            ("startDate", "date_established"),
            ("endDate", "date_finished"),
        ):
            rel.add(prop, parse_date(person.pop(key)))
        context.emit(rel)

    for _related_company in data.pop("related_companies", []):
        # TODO: company/company relations are consumed but not emitted yet.
        # The intended handling mirrors the related-person loop above:
        # resolve the other side with company_id(context, <"company_id">),
        # look the relationship type up in "company_relations" (warning
        # with "Unknown company/company relation type" on a miss), then
        # build and emit the relation entity with the same ID scheme and
        # the date_confirmed / date_established / date_finished dates.
        pass

    address = h.make_address(
        context,
        street=data.pop("street", None),
        city=data.pop("city", None),
    )
    h.apply_address(context, entity, address)

    if data.pop("state_company", False):
        entity.add("topics", "gov.soe")

    # Keys that are knowingly left unprocessed and need not be reported.
    h.audit_data(
        data,
        ignore=[
            "wiki",
            "bank_name",
            "other_founders",
            "other_owners",
            "other_managers",
            "other_recipient",
        ],
    )
    # print(entity.to_dict())
    context.emit(entity)