예제 #1
0
def crawl(context: Context):
    """Load the JSON source and emit a bank, a target and a sanction per record.

    The source is fetched to ``source.json`` and exported as a resource.
    For each record:

    * a ``Company`` is built for the bank and emitted only if a slug could
      be derived from the charter number and bank name;
    * the target is a ``Company`` when ``CompanyName`` is truthy, otherwise
      a ``Person`` named from ``FirstName``/``LastName``;
    * a sanction object is created via ``h.make_sanction`` and filled from
      the optional date/type/docket fields.
    """
    source_path = context.fetch_resource(
        "source.json", context.dataset.data.url
    )
    context.export_resource(source_path, JSON, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        records = json.load(fh)

    # (sanction property, record key) pairs; each key may be absent.
    sanction_fields = (
        ("startDate", "CompleteDate"),
        ("endDate", "TerminationDate"),
        ("program", "EnforcementTypeDescription"),
        ("authorityId", "DocketNumber"),
    )

    for record in records:
        # --- the bank the action relates to ---
        bank = context.make("Company")
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")
        bank.id = context.make_slug(charter_no, bank_name)
        bank_values = (
            ("name", bank_name),
            ("registrationNumber", charter_no),
            ("country", "us"),
            ("topics", "fin.bank"),
        )
        for prop, value in bank_values:
            bank.add(prop, value)
        if bank.id is not None:
            context.emit(bank)

        # --- the sanctioned party: company if named, else a person ---
        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if not company_name:
            entity = context.make("Person")
            entity.id = context.make_id(
                charter_no, bank_name, first_name, last_name
            )
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        else:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)
        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        city = record.pop("CityName")
        state = record.pop("StateName")
        addr = h.make_address(
            context, city=city, state=state, country_code="us"
        )
        # The abbreviation is removed from the record but not used.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        for prop, key in sanction_fields:
            sanction.add(prop, record.pop(key, None))

        context.emit(entity, target=True)
        context.emit(sanction)
예제 #2
0
def crawl(context: Context):
    """Parse the sanctioned-persons HTML table and emit one Person per row.

    The first ``<tr>`` of the table supplies the column headers (slugified
    ``<th>`` texts); every following row is zipped against them. For each
    data row a sanction object from ``h.make_sanction`` is emitted first,
    followed by the person as a target.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for position, row in enumerate(table.findall(".//tr")):
        if position == 0:
            # Header row: remember the column slugs and move on.
            headers = [slugify(th.text) for th in row.findall("./th")]
            continue

        values = [collapse_spaces(td.text) for td in row.findall("./td")]
        data = dict(zip(headers, values))

        person = context.make("Person")
        person.id = context.make_id(data["id"], data["ad-soyad-ata-adi"])
        person.add("name", data["ad-soyad-ata-adi"])
        person.add("idNumber", data["id"])
        person.add("birthDate", parse_date(data["dogum-tarixi"]))
        person.add("country", "az")
        person.add("topics", "sanction")

        address = h.make_address(context, full=data["malumat"])
        h.apply_address(context, person, address)

        context.emit(h.make_sanction(context, person))
        context.emit(person, target=True)
예제 #3
0
def crawl(context: Context):
    """Walk every table in the HTML source and emit one LegalEntity per row.

    Each table must expose "Vendor name" and "From" among its header cells
    (checked with ``assert``). Rows whose first cell is empty are skipped.
    Cells are consumed by position: name, country, address, reason,
    program, start date, end date.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = [
            cell.text_content() for cell in table.findall("./thead/tr/td")
        ]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers

        for row in table.findall("./tbody/tr"):
            cells = [td.text_content() for td in row.findall("./td")]
            if not cells[0]:
                continue

            entity = context.make("LegalEntity")
            # The ID is derived from every cell of the row.
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            # Reuse whatever country value the entity ended up holding.
            country_code = entity.first("country")
            address = h.make_address(
                context, full=cells[2], country_code=country_code
            )
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
예제 #4
0
def parse_party(context: Context, distinct_party, locations, documents):
    """Build and emit a target entity from a distinct-party element.

    The schema is looked up in ``TYPES`` by the party type, falling back to
    the lookup by party sub-type value. If neither resolves, an error is
    logged and ``None`` is returned. Otherwise aliases, registration
    documents (from ``documents``, keyed by identity ID) and features are
    delegated to ``parse_alias``, ``parse_registration_doc`` and
    ``parse_feature``; the entity is emitted as a target and returned.
    """
    profile = distinct_party.find("Profile")
    profile_id = profile.get("ID")

    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    sub_type_schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    # The party type wins; the sub-type schema is only the fallback.
    schema = TYPES.get(type_, sub_type_schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return

    party = context.make(schema)
    party.id = context.make_slug(profile_id)
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile_id)

    for identity in profile.findall("./Identity"):
        # Map each name-part group ID to its resolved name-part type.
        name_parts = {
            group.get("ID"): ref_value(
                "NamePartType", group.get("NamePartTypeID")
            )
            for group in identity.findall(".//NamePartGroup")
        }

        for alias in identity.findall("./Alias"):
            parse_alias(party, name_parts, alias)

        for regdoc in documents.get(identity.get("ID"), []):
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    return party
예제 #5
0
def crawl_person(context: Context) -> None:
    """Emit one sanctioned Person per row of the "person" JSON resource.

    The primary name is the first truthy value among the English, Russian
    and Ukrainian names; the Russian and Ukrainian names are also added as
    aliases. Birth places are only read when the English birth city is not
    the literal "N/A". The country is resolved through ``COUNTRIES``, which
    raises ``KeyError`` for an unmapped value.
    """
    source_url_pattern = "https://sanctions.nazk.gov.ua/sanction-person/%s/"
    birth_place_keys = ("city_bd_en", "city_bd_ru", "city_bd_uk")
    position_keys = ("position_en", "position_ru", "position_uk")
    notes_keys = ("reasoning_en", "reasoning_ru", "reasoning_uk")

    payload = json_resource(context, context.dataset.data.url, "person")
    for raw_row in payload["data"]:
        row = clean_row(raw_row)
        person_id = row.pop("person_id")
        name_en = row.pop("name_en", None)
        name_ru = row.pop("name_ru", None)
        name_uk = row.pop("name_uk", None)
        name = name_en or name_ru or name_uk

        person = context.make("Person")
        person.id = context.make_slug("person", person_id, name)
        person.add("name", name)
        person.add("alias", name_ru)
        person.add("alias", name_uk)
        person.add("birthDate", parse_date(row.pop("date_bd", None)))
        person.add("sourceUrl", source_url_pattern % person_id)

        if row.get("city_bd_en") != "N/A":
            for key in birth_place_keys:
                person.add("birthPlace", row.pop(key, None))
        for key in position_keys:
            person.add("position", row.pop(key, None))
        for key in notes_keys:
            person.add("notes", row.pop(key, None))

        # Read (not popped) so the value stays in the row.
        country = row.get("country", None)
        person.add("country", COUNTRIES[country])
        person.add("topics", "sanction")
        context.emit(person, target=True)
예제 #6
0
def crawl(context: Context):
    """Fetch the debarment JSON API and emit an entity and sanction per record.

    The request carries the dataset's API key in an ``apikey`` header.
    ``clean_name`` splits the supplier name into a list: the first item
    becomes the name, the rest become aliases.
    """
    api_headers = {"apikey": context.dataset.data.api_key}
    payload = context.fetch_json(context.dataset.data.url, headers=api_headers)
    # TODO write this out to a source.json
    for record in payload["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        supplier_name = record.get("SUPP_NAME")
        supplier_id = record.get("SUPP_ID")
        entity.id = context.make_slug(supplier_id)

        names = clean_name(supplier_name)
        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", record.get("COUNTRY_NAME"))
        for alias in names[1:]:
            entity.add("alias", alias)

        address = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=record.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        start_date = h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS)
        sanction.add("startDate", start_date)
        end_date = h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS)
        sanction.add("endDate", end_date)

        context.emit(entity, target=True)
        context.emit(sanction)
예제 #7
0
def crawl_company(context: Context) -> None:
    """Emit one sanctioned Organization per row of the "company" JSON resource.

    The entity ID is slugged from the company ID and name. When that yields
    no ID (``make_slug`` returns ``None``), the OGRN is used instead of the
    name with ``strict=False``.

    The OGRN is popped from the row exactly once and reused. Previously the
    fallback-ID branch popped it, so the later ``ogrnCode`` assignment
    received ``None`` and the code was silently lost for those rows.

    The country is resolved through ``COUNTRIES``, which raises ``KeyError``
    for an unmapped value.
    """
    data = json_resource(context, context.dataset.data.url, "company")
    for row in data["data"]:
        row = clean_row(row)
        company_id = row.pop("company_id")
        name_en = row.pop("name_en", None)
        name = row.pop("name", None) or name_en
        # Single read: needed for both the fallback ID and the ogrnCode.
        ogrn = row.pop("ogrn", None)

        entity = context.make("Organization")
        entity.id = context.make_slug("company", company_id, name)
        if entity.id is None:
            entity.id = context.make_slug(
                "company",
                company_id,
                ogrn,
                strict=False,
            )
        entity.add("name", name)
        entity.add("name", name_en)
        entity.add("name", row.pop("name_uk", None))
        entity.add("name", row.pop("name_ru", None))
        entity.add("innCode", row.pop("inn", None))
        entity.add_cast("Company", "ogrnCode", ogrn)

        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))

        context.emit(entity, target=True)
        # Discard the logo field; it is not mapped to any property.
        row.pop("logo_en", None)
예제 #8
0
def parse_row(context: Context, row):
    """Emit a LegalEntity target and its sanction from one source row.

    The entity ID is slugged from the effective date and the name. The
    sanction is keyed by the ``FR_Citation`` value, which is also stored as
    its program. The entity is emitted before the sanction is built.
    """
    name = row.get("Name")
    country = row.get("Country")
    effective_date = row.get("Effective_Date")

    entity = context.make("LegalEntity")
    entity.id = context.make_slug(effective_date, name)
    entity.add("name", name)
    entity.add("notes", row.get("Action"))
    entity.add("country", country)
    entity.add("modifiedAt", row.get("Last_Update"))

    address = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=country,
    )
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    sanction.add("startDate", h.parse_date(effective_date, FORMATS))
    expiration_date = row.get("Expiration_Date")
    sanction.add("endDate", h.parse_date(expiration_date, FORMATS))
    context.emit(sanction)
예제 #9
0
def crawl_common(context: Context, data: Dict[str, str], part: str,
                 schema: str):
    """Populate the shared fields of a listed entity and emit its sanction.

    Consumes (pops) the fields it uses from ``data``. Keys popped without a
    default (``DATAID``, ``COMMENTS1``, ``NAME_ORIGINAL_SCRIPT``,
    ``LISTED_ON``, ``LAST_DAY_UPDATED``, ``UN_LIST_TYPE``, ``LIST_TYPE``,
    ``REFERENCE_NUMBER``) raise ``KeyError`` when missing.

    The sanction is emitted here; the entity is returned to the caller
    without being emitted.
    """
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")

    comments = data.pop("COMMENTS1")
    entity.add("notes", h.clean_note(comments))
    note = data.pop("NOTE", None)
    entity.add("notes", h.clean_note(note))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    first = data.pop("FIRST_NAME", None)
    second = data.pop("SECOND_NAME", None)
    third = data.pop("THIRD_NAME", None)
    fourth = data.pop("FOURTH_NAME", None)
    h.apply_name(
        entity, name1=first, name2=second, name3=third, name4=fourth,
        quiet=True,
    )

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))

    # Earliest known date wins: submission, then listing, then last update.
    entity.add("createdAt", submitted_on or listed_on or modified_at)
    entity.add("modifiedAt", modified_at)

    sanction.add("listingDate", submitted_on or listed_on)
    sanction.add("startDate", listed_on)
    for key in ("UN_LIST_TYPE", "LIST_TYPE"):
        sanction.add("program", data.pop(key))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
예제 #10
0
def crawl(context: Context):
    """Parse the sectioned HTML list and emit named entities with sanctions.

    ``SECTIONS`` maps a ``div`` id to a ``(section, schema)`` pair. Each
    ``<li>`` text is split at the first "." into a record index and the
    remaining text (trailing ";" removed). The section-specific parser
    fills the entity; it is emitted, with its sanction, only if it ended up
    with a ``name``.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r") as fh:
        doc = html.parse(fh)

    for sec_id, (section, schema) in SECTIONS.items():
        container = doc.find(".//div[@id='%s']" % sec_id)
        for item in container.findall(".//li"):
            raw_text = item.text_content().strip()
            # Raises ValueError when the item has no "." separator.
            index, remainder = raw_text.split(".", 1)
            text = remainder.strip().rstrip(";")

            entity = context.make(schema)
            entity.id = context.make_id(text)
            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)

            if sec_id == "russianUL":
                parse_russian_orgs(context, entity, text)
            elif sec_id == "russianFL":
                parse_russian_persons(context, entity, text)
            elif sec_id == "foreignUL":
                parse_foreign_orgs(context, entity, text)
            elif sec_id == "foreignFL":
                parse_foreign_persons(context, entity, text)

            if not entity.has("name"):
                continue
            context.emit(entity, target=True)
            context.emit(sanction)
예제 #11
0
def crawl(context: Context):
    """Parse the editor-content HTML tables and emit entities with sanctions.

    Within each table the first row provides the headers (slugified with
    "_"). A row with a single cell switches the current schema via
    ``TYPES``. Every other row is a record: the name column yields the name
    plus parenthesised aliases, and the identification column is split from
    the right by each ``CHOPSKA`` marker into properties. Any remaining
    text is resolved through the "details" lookup, with a warning when no
    match exists.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [
                collapse_spaces(td.text_content()) for td in tr.findall("./td")
            ]
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue
            if len(cells) == 1:
                # Single-cell rows act as section headings naming the type.
                schema = TYPES[cells[0]]
                continue
            record = dict(zip(headers, cells))

            entity = context.make(schema)
            full_name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(full_name)
            primary, *bracketed = full_name.split("(")
            entity.add("name", primary)
            for fragment in bracketed:
                entity.add("alias", fragment.split(")")[0])
            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for chop, prop in CHOPSKA:
                # Peel the trailing "<marker><value>" segment, if present.
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) <= 1:
                    continue
                if prop == "address":
                    addr = h.make_address(context, full=parts[1])
                    h.apply_address(context, entity, addr)
                else:
                    entity.add(prop, parts[1])

            if details.strip():
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            sanction.add(
                "provisions", record.pop("zastosowane_srodki_sankcyjne")
            )

            listed = record.pop("data_umieszczenia_na_liscie")
            listed = listed.replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
예제 #12
0
def parse_person(context: Context, data, country, lastmod):
    """Build and emit a PEP ``Person`` from a person record.

    Returns the emitted entity's ID, or ``None`` when the record has no
    name, is named "unknown" (case-insensitive, stripped), or is rejected
    by ``check_person_cutoff``. Consumed fields are popped from ``data``.
    """
    person_id = data.pop("id", None)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("nationality", country)

    name = data.get("name")
    if name is None or name.lower().strip() == "unknown":
        return

    person.add("modifiedAt", lastmod.date())
    person.add("name", data.pop("name", None))
    person.add("alias", data.pop("sort_name", None))
    for other in data.pop("other_names", []):
        person.add("alias", other.get("name"))

    # (entity property, record key) pairs copied over verbatim.
    simple_fields = (
        ("gender", "gender"),
        ("title", "honorific_prefix"),
        ("title", "honorific_suffix"),
        ("firstName", "given_name"),
        ("lastName", "family_name"),
        ("fatherName", "patronymic_name"),
        ("birthDate", "birth_date"),
        ("deathDate", "death_date"),
    )
    for prop, key in simple_fields:
        person.add(prop, data.pop(key, None))
    person.add("email", h.clean_emails(data.pop("email", None)))
    person.add("notes", data.pop("summary", None))
    person.add("topics", "role.pep")

    # Only links with one of these notes are kept, as websites.
    website_notes = ("website", "blog", "twitter", "facebook")
    for link in data.pop("links", []):
        url = link.get("url")
        if link.get("note") in website_notes:
            person.add("website", url)

    for ident in data.pop("identifiers", []):
        identifier = ident.get("identifier")
        if ident.get("scheme") == "wikidata" and identifier.startswith("Q"):
            person.add("wikidataId", identifier)

    for contact_detail in data.pop("contact_details", []):
        value = contact_detail.get("value")
        kind = contact_detail.get("type")
        if kind == "email":
            person.add("email", h.clean_emails(value))
        elif kind == "phone":
            person.add("phone", h.clean_phones(value))

    if check_person_cutoff(person):
        return

    context.emit(person, target=True)
    return person.id
예제 #13
0
def parse_individual(context: Context, node):
    """Emit a Person target, its passports and its sanction from ``node``.

    Shared fields are handled by ``parse_common``, which returns the
    sanction. Document elements with no number, issue date and type are
    skipped; every other document is emitted as a ``Passport`` held by the
    person. The person and the sanction are emitted last.
    """
    person = context.make("Person")
    sanction = parse_common(context, person, node)
    person.add("title", values(node.find("./TITLE")))
    person.add("position", values(node.find("./DESIGNATION")))

    for alias_node in node.findall("./INDIVIDUAL_ALIAS"):
        parse_alias(person, alias_node)

    for addr_node in node.findall("./INDIVIDUAL_ADDRESS"):
        address = parse_address(context, addr_node)
        h.apply_address(context, person, address)

    for doc in node.findall("./INDIVIDUAL_DOCUMENT"):
        passport = context.make("Passport")
        number = doc.findtext("./NUMBER")
        issued = doc.findtext("./DATE_OF_ISSUE")
        type_ = doc.findtext("./TYPE_OF_DOCUMENT")
        if all(field is None for field in (number, issued, type_)):
            continue
        passport.id = context.make_id(person.id, number, issued, type_)
        passport.add("holder", person)
        passport.add("passportNumber", number)
        passport.add("startDate", issued)
        passport.add("type", type_)
        passport.add("type", doc.findtext("./TYPE_OF_DOCUMENT2"))
        passport.add("summary", doc.findtext("./NOTE"))
        # Two element names are tried for the issuing country.
        issuer = doc.findtext("./COUNTRY_OF_ISSUE")
        issuer = issuer or doc.findtext("./ISSUING_COUNTRY")
        passport.add("country", issuer)
        context.emit(passport)

    for nat in node.findall("./NATIONALITY/VALUE"):
        person.add("nationality", nat.text)

    for dob in node.findall("./INDIVIDUAL_DATE_OF_BIRTH"):
        # Prefer the full date, falling back to the bare year.
        person.add("birthDate", dob.findtext("./DATE") or dob.findtext("./YEAR"))

    for pob in node.findall("./INDIVIDUAL_PLACE_OF_BIRTH"):
        birth_address = parse_address(context, pob)
        if birth_address is None:
            continue
        person.add("birthPlace", birth_address.get("full"))
        person.add("country", birth_address.get("country"))

    context.emit(person, target=True)
    context.emit(sanction)
예제 #14
0
def parse_entry(context: Context, node: _Element):
    """Emit a sanctioned entity and its sanction from one list entry.

    An entry with an ``Entity`` element becomes a ``LegalEntity`` whose
    names are the "/"-separated parts of that text. Otherwise it becomes a
    ``Person`` named from ``GivenName``/``LastName`` with the
    ``DateOfBirth`` value.

    The ``Country`` text is used as the sanction program. The entity
    country is the part before the first "/" in that text. ``partition`` is
    used so that a value containing more than one "/" no longer raises
    ``ValueError`` on tuple unpacking.

    A ``Schedule`` of "N/A" is treated as empty.
    """
    entity_name = node.findtext("./Entity")
    dob = node.findtext("./DateOfBirth")
    schedule = node.findtext("./Schedule")
    if schedule == "N/A":
        schedule = ""
    program = node.findtext("./Country")
    item = node.findtext("./Item")

    if entity_name is not None:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        last_name = node.findtext("./LastName")
        entity_name = h.make_name(given_name=given_name, last_name=last_name)
        entity.add("name", entity_name)
        entity.add("birthDate", dob)

    country = program
    if program is not None:
        # Text before the first "/" (the whole text when there is none).
        country = program.partition("/")[0]
    entity.add("country", country)

    entity.id = context.make_slug(
        schedule,
        item,
        entity.first("country"),
        entity_name,
        strict=False,
    )

    sanction = h.make_sanction(context, entity)
    sanction.add("program", program)
    sanction.add("reason", schedule)
    sanction.add("authorityId", item)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            entity.add("alias", collapse_spaces(name))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
예제 #15
0
def parse_reference(context: Context, reference: int, rows):
    """Merge all rows sharing one reference into a single entity and sanction.

    ``rows`` is iterated three times, so it must be re-iterable. The
    function returns early, emitting nothing, after logging a warning when
    a row has an unknown entity type or name type. All rows must resolve to
    the same schema (checked with ``assert``). Fields are popped from the
    row dicts as they are consumed.
    """
    # Pass 1: resolve the schema; every row must agree on it.
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Pass 2: collect names; the last one mapped to "name" feeds the ID.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    # Pass 3: addresses, personal details and listing information.
    citizenship_markers = ["a)", "b)", "c)", "d)"]
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                h.apply_address(
                    context, entity, h.make_address(context, full=part)
                )
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), citizenship_markers)
        entity.add("nationality", citizen, quiet=True)
        birth_dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", birth_dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))

        listing_info = row.pop("listing_information")
        if not isinstance(listing_info, datetime):
            # TODO: consider parsing if it's not a datetime?
            sanction.add("summary", listing_info)
        else:
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
예제 #16
0
def crawl(context: Context):
    """Parse the debarment HTML table and emit an entity and sanction per row.

    The file is read as ISO-8859-1. The first table row gives the headers
    (slugified with "_"). A registration number is split out of the name
    when one of the ``REG_NRS`` markers occurs in it. The entity ID is
    derived from the unsplit name and the cleaned country text.
    """
    source_path = context.fetch_resource(
        "source.html", context.dataset.data.url
    )
    context.export_resource(source_path, HTML, title=context.SOURCE_TITLE)
    with open(source_path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for position, tr in enumerate(table.findall(".//tr")):
        if position == 0:
            headers = [
                slugify(th.text_content(), "_") for th in tr.findall("./th")
            ]
            continue

        texts = [
            collapse_spaces(td.text_content()) for td in tr.findall("./td")
        ]
        record = dict(zip(headers, texts))
        # Drop the entry for any header that mapped to a None key.
        record.pop(None, None)

        full_name = name = record.pop("name")
        registration_number = None
        for splitter in REG_NRS:
            if splitter not in name:
                continue
            name, registration_number = name.split(splitter, 1)
            registration_number = registration_number.replace(")", "")

        country = record.pop("nationality")
        for noise in ("Non ADB Member Country", "Rep. of"):
            country = country.replace(noise, "")

        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", record.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", record.pop("grounds"))
        sanction.add("program", record.pop("sanction_type"))
        date_range = record.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_text, end_text = date_range.split("|")
            start_date = h.parse_date(start_text.strip(), FORMATS)
            sanction.add("startDate", start_date)
            end_date = h.parse_date(end_text.strip(), FORMATS)
            sanction.add("endDate", end_date)

        address = h.make_address(
            context, full=record.pop("address"), country=country
        )
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
예제 #17
0
def crawl_row(context: Context, row: Dict[str, str]):
    """Emit a Person target whose ID is the row's QID.

    Rows with a blank ``qid`` are skipped silently; rows whose value fails
    ``is_qid`` are skipped with a warning.
    """
    qid = row.get("qid", "").strip()
    if not qid:
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return

    person = context.make("Person")
    person.id = qid
    person.add("topics", "role.oligarch")
    context.emit(person, target=True)
예제 #18
0
def make_entity(context: Context, el, schema, entity_id):
    """Create a sanction-tagged entity from ``el`` and emit its sanction.

    The entity receives the cleaned ``./note`` text as notes; the sanction
    receives the ``./correction`` text as its summary. Only the sanction is
    emitted here; the entity is returned for the caller to finish.
    """
    entity = context.make(schema, target=True)
    entity.id = entity_id
    note_text = el.findtext("./note")
    entity.add("notes", h.clean_note(note_text))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    correction = el.findtext("./correction")
    sanction.add("summary", correction)
    context.emit(sanction)

    return entity
예제 #19
0
def crawl_node(context: Context, node):
    """Emit an MEP as a Person target plus party and group memberships.

    The national party is skipped when it is "Independent" or when no slug
    can be built for it. The political group is skipped only when no slug
    can be built. Each emitted organization is followed by a ``Membership``
    linking the person to it.
    """

    def emit_membership(organization):
        # Link the person to an organization that was just emitted.
        membership = context.make("Membership")
        membership.id = context.make_id(person.id, organization.id)
        membership.add("member", person)
        membership.add("organization", organization)
        context.emit(membership)

    mep_id = node.findtext(".//id")
    person = context.make("Person")
    person.id = context.make_slug(mep_id)
    person.add("sourceUrl", "http://www.europarl.europa.eu/meps/en/%s" % mep_id)

    full_name = node.findtext(".//fullName")
    person.add("name", full_name)
    first_name, last_name = split_name(full_name)
    person.add("firstName", first_name)
    person.add("lastName", last_name)
    country = node.findtext(".//country")
    person.add("nationality", country)
    person.add("topics", "role.pep")
    context.emit(person, target=True)

    party_name = node.findtext(".//nationalPoliticalGroup")
    if party_name not in ["Independent"]:
        party = context.make("Organization")
        party.id = context.make_slug("npg", party_name)
        if party.id is not None:
            party.add("name", party_name)
            party.add("country", country)
            context.emit(party)
            emit_membership(party)

    group_name = node.findtext(".//politicalGroup")
    group = context.make("Organization")
    group.id = context.make_slug("pg", group_name)
    if group.id is not None:
        group.add("name", group_name)
        group.add("country", "eu")
        context.emit(group)
        emit_membership(group)
예제 #20
0
def salvage_entity(context: Context, entry):
    """Build a LegalEntity from an entry that only carries two remarks.

    The entry must have exactly two ``./remark`` children (checked with
    ``assert``): the first is the name, cut at the first "(", and the
    second is stored as notes. Sanctions are attached by
    ``parse_sanctions`` before the entity is emitted as a target.
    """
    remarks = [remark.text for remark in entry.findall("./remark")]
    assert len(remarks) == 2, remarks
    raw_name, details = remarks
    name, _, _ = raw_name.partition("(")

    entity = context.make("LegalEntity")
    entity.id = context.make_id(name)
    entity.add("name", name)
    entity.add("notes", details)
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)
    context.emit(entity, target=True)
예제 #21
0
def parse_entity(context: Context, node):
    """Emit a LegalEntity target and its sanction from ``node``.

    Shared fields are filled by ``parse_common``, which returns the
    sanction. Aliases and addresses come from the ``ENTITY_ALIAS`` and
    ``ENTITY_ADDRESS`` children.
    """
    entity = context.make("LegalEntity")
    sanction = parse_common(context, entity, node)

    for alias_node in node.findall("./ENTITY_ALIAS"):
        parse_alias(entity, alias_node)

    for addr_node in node.findall("./ENTITY_ADDRESS"):
        address = parse_address(context, addr_node)
        h.apply_address(context, entity, address)

    context.emit(entity, target=True)
    context.emit(sanction)
예제 #22
0
def crawl_row(context: Context, row):
    """Emit a Person target from one row.

    ``Tag``, ``Name eng`` and ``DOB`` are popped from the row (a missing
    key raises ``KeyError``) and together form the entity ID. Each line of
    the tag is added as a position.
    """
    person = context.make("Person")
    tag = row.pop("Tag")
    name_en = row.pop("Name eng")
    dob = row.pop("DOB")
    person.id = context.make_id(name_en, tag, dob)

    person.add("name", name_en)
    person.add("alias", row.get("Name cyrillic"))
    person.add("birthDate", parse_date(dob))
    description = row.get("Description")
    person.add("notes", collapse_spaces(description))
    positions = tag.split("\n")
    person.add("position", positions)
    person.add("gender", row.get("Gender"))

    context.emit(person, target=True)
예제 #23
0
def crawl_row(context: Context, row: Dict[str, str]):
    """Emit a target entity whose ID is the row's QID.

    Rows with a blank ``qid`` are skipped silently; rows whose value fails
    ``is_qid`` are skipped with a warning. The schema defaults to "Person"
    when the ``schema`` column is missing or empty. ``topics`` is a
    ";"-separated list; blank items are ignored.
    """
    qid = row.get("qid", "").strip()
    if not qid:
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return

    entity = context.make(row.get("schema") or "Person")
    entity.id = qid
    stripped = (topic.strip() for topic in row.get("topics", "").split(";"))
    entity.add("topics", [topic for topic in stripped if topic])
    context.emit(entity, target=True)
예제 #24
0
def parse_identity(context: Context, entity, node, places):
    """Copy the identity details found below ``node`` onto ``entity``.

    Handles names, addresses, birth/incorporation dates, nationalities,
    places of birth and identification documents. Every identification
    document is additionally emitted as its own ``Identification`` or
    ``Passport`` entity whose holder is ``entity``.

    Args:
        context: Crawler context used to create and emit entities.
        entity: Entity that is enriched in place.
        node: XML element containing the identity data.
        places: Mapping looked up by the ``place-id`` attribute; the result
            is handed to ``compose_address``.
    """
    for name in node.findall(".//name"):
        parse_name(entity, name)

    for address_node in node.findall(".//address"):
        place = places.get(address_node.get("place-id"))
        address = compose_address(context, entity, place, address_node)
        h.apply_address(context, entity, address)

    for bday in node.findall(".//day-month-year"):
        bval = parse_parts(bday.get("year"), bday.get("month"),
                           bday.get("day"))
        # The same element carries the founding date for non-persons.
        if entity.schema.is_a("Person"):
            entity.add("birthDate", bval)
        else:
            entity.add("incorporationDate", bval)

    for nationality in node.findall(".//nationality"):
        country = nationality.find("./country")
        if country is not None:
            entity.add("nationality", country.get("iso-code"))
            entity.add("nationality", country.text)

    for bplace in node.findall(".//place-of-birth"):
        place = places.get(bplace.get("place-id"))
        birth_address = compose_address(context, entity, place, bplace)
        entity.add("birthPlace", birth_address.get("full"))

    for doc in node.findall(".//identification-document"):
        issuer = doc.find("./issuer")
        # find() returns None when there is no <issuer> child; reading
        # ``.text`` unconditionally would raise AttributeError.
        issuer_name = issuer.text if issuer is not None else None
        type_ = doc.get("document-type")
        number = doc.findtext("./number")
        entity.add("nationality", issuer_name, quiet=True)
        schema = "Identification"
        # Compare for equality: ``type_ in ("id-card")`` is a substring test
        # against a plain string (the parentheses do not make a tuple), which
        # matches "id", "card" or "" and raises TypeError for None.
        if type_ == "id-card":
            entity.add("idNumber", number)
        elif type_ in ("passport", "diplomatic-passport"):
            entity.add("idNumber", number)
            schema = "Passport"
        passport = context.make(schema)
        passport.id = context.make_id(entity.id, type_, doc.get("ssid"))
        passport.add("holder", entity)
        passport.add("country", issuer_name)
        passport.add("number", number)
        passport.add("type", type_)
        passport.add("summary", doc.findtext("./remark"))
        passport.add("startDate", doc.findtext("./date-of-issue"))
        passport.add("endDate", doc.findtext("./expiry-date"))
        context.emit(passport)
예제 #25
0
def _apply_bracket_term(context: Context, entity, term) -> bool:
    """Apply one bracketed term to ``entity``; return False if unrecognised.

    A "props" lookup takes precedence; otherwise the term is interpreted as
    a citizenship, a date of birth or a passport number.
    """
    res = context.lookup("props", term)
    if res is not None:
        for prop, value in res.props.items():
            entity.add(prop, value)
        return True
    if term.endswith("citizen"):
        entity.add("nationality", term.replace("citizen", ""))
        return True
    if term.startswith(DOB):
        dob = term.replace(DOB, "").strip()
        entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
        return True
    if term.startswith(PASSPORT):
        entity.add("passportNumber", term.replace(PASSPORT, "").strip())
        return True
    return False


def crawl(context: Context):
    """Fetch the source HTML page and emit one sanctioned Person per row.

    Only ``td.tailSTxt`` cells whose text starts with "2." are processed.
    Each table row inside such a cell yields a Person (the text before the
    first "(" is the name, split on "s/o" and "@") and a sanction carrying
    ``PROGRAM``. Terms matched by ``IN_BRACKETS`` add further properties;
    terms that cannot be interpreted are logged as warnings.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            name, _ = text.split("(", 1)
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                if not _apply_bracket_term(context, entity, match):
                    # ``warning`` rather than the ``warn`` alias, matching
                    # the other crawlers' logging calls.
                    context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
예제 #26
0
def crawl(context: Context):
    """Fetch the source JSON and emit one CryptoWallet per ``result`` entry.

    Every consumed key is popped from the entry so that ``h.audit_data`` can
    inspect whatever is left over (``transactions`` is ignored).
    """
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        payload = json.load(fh)

    # Wallet property -> source key, for values copied over unchanged.
    plain_fields = (
        ("createdAt", "createdAt"),
        ("modifiedAt", "updatedAt"),
        ("alias", "family"),
    )
    for item in payload.get("result", []):
        wallet = context.make("CryptoWallet", target=True)
        wallet.id = context.make_slug(item.get("address"))
        wallet.add("publicKey", item.pop("address"))
        wallet.add("topics", "crime.theft")
        for prop, key in plain_fields:
            wallet.add(prop, item.pop(key))
        balance = format_number(item.pop("balance"))
        wallet.add("balance", balance)
        balance_usd = format_number(item.pop("balanceUSD"))
        wallet.add("amountUsd", balance_usd)
        wallet.add("currency", item.pop("blockchain"))
        h.audit_data(item, ignore=["transactions"])
        context.emit(wallet)
예제 #27
0
def parse_person(context: Context, node):
    """Build a Person from ``node`` and hand it to ``parse_common``.

    The entity ID is derived from the node's tag, its ``Number`` child and
    the three name parts.
    """
    entity = context.make("Person")

    given_name = node.findtext("./Name")
    patronymic = node.findtext("./Patronomic")
    last_name = node.findtext("./Surname")
    h.apply_name(
        entity,
        given_name=given_name,
        patronymic=patronymic,
        last_name=last_name,
    )

    id_parts = (
        node.tag,
        node.findtext("./Number"),
        given_name,
        patronymic,
        last_name,
    )
    entity.id = context.make_id(*id_parts)

    birth_date = h.parse_date(node.findtext("./DataBirth"), FORMATS)
    entity.add("birthDate", birth_date)
    entity.add("birthPlace", node.findtext("./PlaceBirth"))
    parse_common(context, node, entity)
예제 #28
0
def parse_membership(context: Context, data, persons, organizations, events):
    """Record a membership as a ``position`` on the member's Person entity.

    Nothing is emitted unless both the person and the organization referenced
    by ``data`` resolve through ``persons`` and ``organizations``. Missing
    role or dates fall back to the legislative period found in ``events``.
    Source URLs listed under ``data["sources"]`` are not recorded.
    """
    person_id = persons.get(data.pop("person_id", None))
    org_name = organizations.get(data.pop("organization_id", None))
    if not (person_id and org_name):
        return

    period = events.get(data.get("legislative_period_id"), {})
    comment = data.pop("role", None) or period.get("name")
    starts = [data.get("start_date"), period.get("start_date")]
    ends = [data.get("end_date"), period.get("end_date")]

    person = context.make("Person")
    person.id = person_id
    person.add("position", post_summary(org_name, comment, starts, ends, []))
    context.emit(person, target=True)
예제 #29
0
def crawl(context: Context):
    """Fetch the source CSV and emit one PEP Person per row with a valid QID.

    The ``catalog`` column is used as the person's country; a log line is
    written each time its value changes from one row to the next. Rows whose
    ``personID`` is missing or fails ``is_qid`` are skipped.
    """
    path = context.fetch_resource("source.csv", context.dataset.data.url)
    context.export_resource(path, CSV, title=context.SOURCE_TITLE)
    prev_country = None
    # The csv module requires newline="" so that line breaks embedded in
    # quoted fields are handled by the parser rather than the file object.
    with open(path, "r", newline="") as fh:
        for row in csv.DictReader(fh):
            country = row.get("catalog")
            if country != prev_country:
                context.log.info("Crawl country", country=country)
                prev_country = country
            qid: Optional[str] = row.get("personID")
            if qid is None or not is_qid(qid):
                continue
            # Build the entity only once the row is known to be usable.
            entity = context.make("Person")
            entity.id = qid
            entity.add("name", row.get("person"))
            entity.add("topics", "role.pep")
            entity.add("country", country)
            context.emit(entity, target=True)
예제 #30
0
def crawl_physical(context: Context) -> None:
    """Emit a Person target for each record of the ``PHYSICAL_URL`` resource.

    Consumed keys are popped from the record before the remainder is passed
    to ``handle_sanction``.
    """
    data = json_resource(context, PHYSICAL_URL, "physical")
    for row in data:
        entity = context.make("Person")
        ukaz_id = row.pop("ukaz_id")
        index = row.pop("index")
        entity.id = context.make_slug(ukaz_id, index)

        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        alternatives = row.pop("name_alternative", None)
        for alias in multi_split(alternatives, [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        citizenship = row.pop("citizenship", None)
        for country in multi_split(citizenship, [", "]):
            entity.add("nationality", country)
        entity.add("birthDate", row.pop("birthdate", None))
        entity.add("birthPlace", row.pop("birthplace", None))
        entity.add("position", row.pop("occupation", None))

        living_place = row.pop("livingplace", None)
        handle_address(context, entity, living_place)
        handle_sanction(context, entity, row)

        context.emit(entity, target=True)