def parse_entry(context, node):
    """Build and emit one listed entity and its sanction from an XML node.

    A node with an ``Entity`` child becomes a ``LegalEntity`` whose name(s)
    are the "/"-separated parts of that text. Otherwise a ``Person`` is
    built from ``GivenName``, ``LastName`` and ``DateOfBirth``.

    Args:
        context: Crawler context used to make, identify and emit entities.
        node: XML element exposing ``findtext``.
    """
    entity_name = node.findtext("./Entity")
    if entity_name is not None:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))
    else:
        entity = context.make("Person")
        given_name = node.findtext("./GivenName")
        entity.add("firstName", given_name)
        last_name = node.findtext("./LastName")
        entity.add("lastName", last_name)
        entity.add("name", jointext(given_name, last_name))
        entity.add("birthDate", node.findtext("./DateOfBirth"))

    # ids are per country and entry type (individual/entity)
    item = node.findtext("./Item")
    schedule = node.findtext("./Schedule")
    country = node.findtext("./Country")
    if country is not None and "/" in country:
        # Keep only the part before the first slash. Using maxsplit avoids an
        # unpacking error when the value contains more than one slash, and the
        # None check avoids a TypeError when the Country element is missing.
        country = country.split("/", 1)[0]
    # NOTE(review): with strict=False the slug helper is presumably tolerant
    # of a missing country component -- confirm against make_slug.
    entity.id = context.make_slug(country, schedule, item, strict=False)
    entity.add("country", country)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", schedule)

    names = node.findtext("./Aliases")
    if names is not None:
        for name in names.split(", "):
            entity.add("alias", collapse_spaces(name))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_entry(context, entry):
    """Emit a sanction for the profile referenced by a sanctions entry.

    A ``Thing`` stub is created for the entry's ``ProfileID`` so the sanction
    can be attached to it; only the sanction itself is emitted here. The
    earliest and latest event dates are stored in the stub's context.

    Args:
        context: Crawler context used to make, identify and emit entities.
        entry: XML element for one sanctions entry.
    """
    profile = context.make("Thing")
    profile.id = context.make_slug(entry.get("ProfileID"))

    sanction = h.make_sanction(context, profile, key=entry.get("ID"))
    sanction.add("program", ref_value("List", entry.get("ListID")))

    event_dates = set()
    for event in entry.findall("./EntryEvent"):
        event_date = parse_date(event.find("./Date"))
        event_dates.add(event_date)
        sanction.add("startDate", event_date)
        sanction.add("summary", event.findtext("./Comment"))
        sanction.add("reason", ref_value("LegalBasis", event.get("LegalBasisID")))

    if event_dates:
        profile.context["created_at"] = min(event_dates)
        profile.context["updated_at"] = max(event_dates)

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        sanction.add(
            "program",
            ref_value("SanctionsType", measure.get("SanctionsTypeID")),
        )

    context.emit(sanction)
def parse_entry(context: Context, entry, parties):
    """Attach a sanction to an already-parsed party and emit both.

    Args:
        context: Crawler context used to identify and emit entities.
        entry: XML element for one sanctions entry.
        parties: Mapping from party slug to the previously built party
            entity; the entry's ``ProfileID`` slug must be present in it.
    """
    party = parties[context.make_slug(entry.get("ProfileID"))]

    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    sanction.add("program", ref_value("List", entry.get("ListID")))

    for event in entry.findall("./EntryEvent"):
        party.add("createdAt", parse_date(event.find("./Date")))
        sanction.add("summary", event.findtext("./Comment"))
        sanction.add("reason", ref_value("LegalBasis", event.get("LegalBasisID")))

    party.add("topics", "sanction")
    # The sanction dates are copied from what the party has accumulated.
    sanction.add("listingDate", party.get("createdAt"))
    sanction.add("startDate", party.get("modifiedAt"))

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        sanction.add(
            "program",
            ref_value("SanctionsType", measure.get("SanctionsTypeID")),
        )

    context.emit(sanction)
    context.emit(party, target=True)
def crawl(context: Context):
    """Fetch the JSON supplier list and emit one debarred entity per record.

    The request is authenticated with the dataset's ``api_key`` sent in the
    ``apikey`` header. Each record under ``response.ZPROCSUPP`` yields a
    ``LegalEntity``, an address and a sanction.
    """
    payload = context.fetch_json(
        context.dataset.data.url,
        headers={"apikey": context.dataset.data.api_key},
    )
    # TODO write this out to a source.json
    for record in payload["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        entity.id = context.make_slug(record.get("SUPP_ID"))

        # The first cleaned name is the primary one; the rest become aliases.
        primary, *aliases = clean_name(record.get("SUPP_NAME"))[:1] + clean_name(
            record.get("SUPP_NAME")
        )[1:] if False else _split_names(record)
        entity.add("name", primary)
        entity.add("topics", "debarment")
        entity.add("country", record.get("COUNTRY_NAME"))
        for alias in aliases:
            entity.add("alias", alias)

        address = h.make_address(
            context,
            street=record.get("SUPP_ADDR"),
            city=record.get("SUPP_CITY"),
            country=record.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)

        sanction = h.make_sanction(context, entity)
        sanction.add("program", record.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(record.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(record.get("DEBAR_TO_DATE"), FORMATS))

        context.emit(entity, target=True)
        context.emit(sanction)


def _split_names(record):
    """Return ``[primary, *aliases]`` for a record, as ``clean_name`` yields them."""
    names = clean_name(record.get("SUPP_NAME"))
    # Indexing (rather than unpacking an empty list) keeps the IndexError the
    # inline ``names[0]`` access would raise on an empty result.
    return [names[0], *names[1:]]
def parse_reference(context, reference, rows):
    """Merge all spreadsheet rows sharing one reference into a single entity.

    The entity starts as a ``LegalEntity`` and is switched to ``Person`` when
    any row has type "Individual". Rows are consumed destructively via
    ``pop``.

    Args:
        context: Crawler context used to make, identify and emit entities.
        reference: Reference value used to build the entity id.
        rows: Iterable of row dicts belonging to this reference.
    """
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        if row.pop("type") == "Individual":
            entity.schema = model.get("Person")

        name = row.pop("name_of_individual_or_entity", None)
        name_prop = "alias" if row.pop("name_type") == "aka" else "name"
        entity.add(name_prop, name)

        h.apply_address(
            context,
            entity,
            h.make_address(context, full=row.pop("address")),
        )
        sanction.add("program", row.pop("committees"))

        nationalities = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", nationalities, quiet=True)
        entity.add("birthDate", clean_date(row.pop("date_of_birth")), quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)

        control_date = row.pop("control_date")
        sanction.add("modifiedAt", control_date)
        entity.add("modifiedAt", control_date)
        # The last row processed determines the stored update timestamp.
        entity.context["updated_at"] = control_date.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_row(context: Context, row):
    """Emit a ``LegalEntity`` with address and sanction for one source row.

    The entity id is derived from the row's ``Effective_Date`` and ``Name``;
    the sanction is keyed on the ``FR_Citation`` value.

    Args:
        context: Crawler context used to make, identify and emit entities.
        row: Mapping of column name to value for one record.
    """
    country = row.get("Country")

    entity = context.make("LegalEntity")
    entity.id = context.make_slug(row.get("Effective_Date"), row.get("Name"))
    entity.add("name", row.get("Name"))
    entity.add("notes", row.get("Action"))
    entity.add("country", country)
    entity.add("modifiedAt", row.get("Last_Update"))

    address = h.make_address(
        context,
        street=row.get("Street_Address"),
        postal_code=row.get("Postal_Code"),
        city=row.get("City"),
        region=row.get("State"),
        country=country,
    )
    h.apply_address(context, entity, address)
    context.emit(entity, target=True)

    citation = row.get("FR_Citation")
    sanction = h.make_sanction(context, entity, key=citation)
    sanction.add("program", citation)
    for prop, column in (("startDate", "Effective_Date"), ("endDate", "Expiration_Date")):
        sanction.add(prop, h.parse_date(row.get(column), FORMATS))
    context.emit(sanction)
def crawl_common(context: Context, data: Dict[str, str], part: str, schema: str):
    """Build the fields shared by all record types and emit the sanction.

    Consumed keys are popped from ``data`` so the caller can handle what
    remains. The entity itself is NOT emitted here.

    Args:
        context: Crawler context used to make, identify and emit entities.
        data: Record dict; mutated in place.
        part: Prefix combined with ``DATAID`` to form the entity id.
        schema: Schema name for the new entity.

    Returns:
        The partially populated entity.
    """
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    entity.add("notes", h.clean_note(data.pop("COMMENTS1")))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    name_parts = {
        "name1": data.pop("FIRST_NAME", None),
        "name2": data.pop("SECOND_NAME", None),
        "name3": data.pop("THIRD_NAME", None),
        "name4": data.pop("FOURTH_NAME", None),
    }
    h.apply_name(entity, quiet=True, **name_parts)

    sanction = h.make_sanction(context, entity)

    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))

    # First available date wins, in order: submitted, listed, last updated.
    listing_date = submitted_on or listed_on
    entity.add("createdAt", listing_date or modified_at)
    entity.add("modifiedAt", modified_at)
    sanction.add("listingDate", listing_date)
    sanction.add("startDate", listed_on)

    sanction.add("program", data.pop("UN_LIST_TYPE"))
    sanction.add("program", data.pop("LIST_TYPE"))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
def crawl(context: Context):
    """Parse the HTML table of listed persons and emit one entity per row.

    The first table row supplies the column headers (slugified ``th`` text);
    every later row is zipped against them and turned into a ``Person`` with
    an address and a sanction.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            # Header row: remember the column slugs and move on.
            headers = [slugify(th.text) for th in row.findall("./th")]
            continue

        values = [collapse_spaces(td.text) for td in row.findall("./td")]
        data = dict(zip(headers, values))
        full_name = data["ad-soyad-ata-adi"]

        entity = context.make("Person")
        entity.id = context.make_id(data["id"], full_name)
        entity.add("name", full_name)
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        h.apply_address(
            context,
            entity,
            h.make_address(context, full=data["malumat"]),
        )

        context.emit(h.make_sanction(context, entity))
        context.emit(entity, target=True)
def crawl(context: Context):
    """Parse every table in the page and emit one entity per vendor row.

    Each table's header cells must include "Vendor name" and "From"
    (asserted). Rows whose first cell is empty are skipped. Columns are read
    positionally: name, country, address, reason, program, start, end.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = [cell.text_content() for cell in table.findall("./thead/tr/td")]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers

        for row in table.findall("./tbody/tr"):
            cells = [cell.text_content() for cell in row.findall("./td")]
            if not cells[0]:
                continue

            entity = context.make("LegalEntity")
            # Every cell participates in the id.
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            address = h.make_address(
                context,
                full=cells[2],
                country_code=entity.first("country"),
            )
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
def crawl(context: Context):
    """Parse the numbered list items of each configured page section.

    ``SECTIONS`` maps a ``div`` id to a ``(section, schema)`` pair. Every
    ``li`` in that div is expected to look like ``"<index>. <text>"``; the
    text is handed to the section-specific parser, and the entity and its
    sanction are emitted only if the parser produced a name.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for sec_id, (section, schema) in SECTIONS.items():
        container = doc.find(".//div[@id='%s']" % sec_id)
        for item in container.findall(".//li"):
            index, text = item.text_content().strip().split(".", 1)
            # Drop surrounding whitespace, then any trailing semicolons.
            text = text.strip().rstrip(";")

            entity = context.make(schema)
            entity.id = context.make_id(text)

            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)

            if sec_id == "russianUL":
                parse_russian_orgs(context, entity, text)
            elif sec_id == "russianFL":
                parse_russian_persons(context, entity, text)
            elif sec_id == "foreignUL":
                parse_foreign_orgs(context, entity, text)
            elif sec_id == "foreignFL":
                parse_foreign_persons(context, entity, text)

            if not entity.has("name"):
                continue
            context.emit(entity, target=True)
            context.emit(sanction)
def crawl(context: Context):
    """Parse the listing tables inside the page's editor content.

    Within each table the first row supplies the headers, a single-cell row
    switches the schema for the rows that follow (via ``TYPES``), and every
    other row is one listed subject. Identification details are chopped
    from the right using the ``CHOPSKA`` markers; whatever is left over is
    resolved through the ``details`` lookup.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())

    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for tr in table.findall(".//tr"):
            cells = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
            if headers is None:
                headers = [slugify(cell, sep="_") for cell in cells]
                continue
            if len(cells) == 1:
                # Section heading row: selects the schema for later rows.
                schema = TYPES[cells[0]]
                continue

            record = dict(zip(headers, cells))
            entity = context.make(schema)

            name = record.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            # "Name (alias) (alias)": text before the first "(" is the name,
            # each bracketed chunk up to its ")" is an alias.
            primary, *bracketed = name.split("(")
            entity.add("name", primary)
            for chunk in bracketed:
                entity.add("alias", chunk.split(")")[0])

            entity.add("notes", record.pop("uzasadnienie_wpisu_na_liste"))

            details = record.pop("dane_identyfikacyjne_osoby_podmiotu")
            for marker, prop in CHOPSKA:
                # Cut off everything after the last occurrence of the marker;
                # the remainder is processed against the next marker.
                details, *tail = details.rsplit(marker, 1)
                if not tail:
                    continue
                if prop == "address":
                    addr = h.make_address(context, full=tail[0])
                    h.apply_address(context, entity, addr)
                else:
                    entity.add(prop, tail[0])

            if details.strip():
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            sanction.add("provisions", record.pop("zastosowane_srodki_sankcyjne"))
            listed = record.pop("data_umieszczenia_na_liscie").replace(" r.", "")
            sanction.add("startDate", h.parse_date(listed, ["%d.%m.%Y"]))

            h.audit_data(record)
            context.emit(entity, target=True)
            context.emit(sanction)
def handle_sanction(context, entity, row):
    """Create, populate and emit the sanction for ``entity`` from ``row``.

    Each source column is popped from ``row`` (missing columns yield
    ``None``) and stored under the corresponding sanction property.

    Args:
        context: Crawler context used to emit the sanction.
        entity: Entity the sanction refers to.
        row: Record dict; the consumed columns are removed from it.
    """
    sanction = h.make_sanction(context, entity)
    column_for_prop = (
        ("status", "action"),
        ("summary", "restriction_period"),
        ("program", "restriction_type"),
        ("startDate", "ukaz_date"),
        ("endDate", "restriction_end_date"),
    )
    for prop, column in column_for_prop:
        sanction.add(prop, row.pop(column, None))
    context.emit(sanction)
def parse_reference(context: Context, reference: int, rows):
    """Merge all rows sharing one reference number into a single entity.

    ``rows`` is traversed three times (schema, names, details), so it must
    be re-iterable; the row dicts are consumed via ``pop``. Nothing is
    emitted if an entity type or a name type is not covered by the lookups.

    Args:
        context: Crawler context used for lookups, logging and emitting.
        reference: Reference number shared by the rows.
        rows: Sequence of row dicts for this reference.
    """
    # Pass 1: all rows must resolve to one and the same schema.
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    # Pass 2: names; the last one mapped to "name" becomes part of the id.
    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    # Pass 3: addresses, biographical details and listing dates.
    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                h.apply_address(context, entity, h.make_address(context, full=part))

        sanction.add("program", row.pop("committees"))
        nationalities = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", nationalities, quiet=True)
        entity.add("birthDate", clean_date(row.pop("date_of_birth")), quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))

        listing_info = row.pop("listing_information")
        if not isinstance(listing_info, datetime):
            # TODO: consider parsing if it's not a datetime?
            sanction.add("summary", listing_info)
        else:
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)

        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Parse the debarment table and emit one ``LegalEntity`` per row.

    The source file is read as ISO-8859-1. The first row supplies headers;
    in later rows a registration number is split off the name using the
    ``REG_NRS`` markers, and the "start | end" date cell is split into the
    sanction's start and end dates.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for tr in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(th.text_content(), "_") for th in tr.findall("./th")]
            continue

        values = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
        cells = dict(zip(headers, values))
        # Discard the column whose header slugified to None, if any.
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for splitter in REG_NRS:
            if splitter not in name:
                continue
            name, registration_number = name.split(splitter, 1)
            registration_number = registration_number.replace(")", "")

        country = (
            cells.pop("nationality")
            .replace("Non ADB Member Country", "")
            .replace("Rep. of", "")
        )

        entity = context.make("LegalEntity")
        # The id uses the unsplit name so it is independent of the markers.
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))

        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            sanction.add("startDate", h.parse_date(start_date.strip(), FORMATS))
            sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS))

        address = h.make_address(context, full=cells.pop("address"), country=country)
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
def parse_common(context: Context, node, entity):
    """Add the fields shared by all entry types, then emit entity and sanction.

    Args:
        context: Crawler context used to emit.
        node: XML element holding ``BasicInclusion``, ``CategoryPerson`` and
            ``DateInclusion`` children.
        entity: Entity already populated by the caller.
    """
    included_on = h.parse_date(node.findtext("./DateInclusion"), FORMATS)

    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    # The inclusion date serves as both the start and the listing date.
    for prop in ("startDate", "listingDate"):
        sanction.add(prop, included_on)

    entity.add("createdAt", included_on)
    entity.add("topics", "sanction")

    context.emit(entity, target=True)
    context.emit(sanction)
def make_entity(context: Context, el, schema, entity_id):
    """Create a target entity with the given id and emit its sanction.

    The entity itself is returned un-emitted so the caller can keep adding
    properties to it.

    Args:
        context: Crawler context used to make and emit.
        el: XML element providing the ``note`` and ``correction`` children.
        schema: Schema name for the entity.
        entity_id: Identifier assigned to the entity.

    Returns:
        The new entity.
    """
    entity = context.make(schema, target=True)
    entity.id = entity_id
    entity.add("notes", h.clean_note(el.findtext("./note")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    sanction.add("summary", el.findtext("./correction"))
    context.emit(sanction)

    return entity
def parse_common(context, node, entity):
    """Assign the id and shared fields, then emit entity and sanction.

    The id is built from the node's tag name and its ``Number`` child.

    Args:
        context: Crawler context used to identify and emit.
        node: XML element for one list entry.
        entity: Entity already populated by the caller.
    """
    entity.id = context.make_slug(node.tag, node.findtext("./Number"))
    included_on = h.parse_date(node.findtext("./DateInclusion"), FORMATS)

    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    sanction.add("startDate", included_on)

    # Only record a creation timestamp when the date could be parsed.
    if included_on is not None:
        entity.context["created_at"] = included_on

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_entry(context, target, programs, places, updated_at):
    """Build and emit one sanctions target and its sanction.

    The target's payload is looked up as ``entity`` (LegalEntity), then
    ``individual`` (Person), then ``object`` (Vessel). Targets with none of
    these children are logged and skipped instead of failing on ``None``.

    Args:
        context: Crawler context used to make, identify, log and emit.
        target: XML element for one target.
        programs: Mapping from sanctions-set id to program name.
        places: Passed through to ``parse_identity``.
        updated_at: Fallback date used when the target has no modification
            with a publication date.
    """
    entity = context.make("LegalEntity")
    node = target.find("./entity")
    if node is None:
        node = target.find("./individual")
        entity = context.make("Person")
    if node is None:
        node = target.find("./object")
        if node is None:
            context.log.warning(
                "Unknown target type",
                target=target,
                object_type=None,
            )
            return
        object_type = node.get("object-type")
        if object_type != "vessel":
            context.log.warning(
                "Unknown target type",
                target=target,
                object_type=object_type,
            )
        entity = context.make("Vessel")

    dates = set()
    for mod in target.findall("./modification"):
        date = mod.get("publication-date")
        if date is not None:
            dates.add(date)
    if not dates:
        dates.add(updated_at)
    entity.context["created_at"] = min(dates)
    entity.context["updated_at"] = max(dates)

    entity.id = context.make_slug(target.get("ssid"))
    entity.add("gender", node.get("sex"), quiet=True)

    for other in node.findall("./other-information"):
        if other.text is None:
            # Empty element: nothing to record.
            continue
        value = other.text.strip()
        if entity.schema.is_a("Vessel") and value.lower().startswith("imo"):
            _, sep, imo_num = value.partition(":")
            if sep:
                entity.add("imoNumber", imo_num)
                continue
            # No colon after "imo": fall through and keep the text as a note
            # rather than failing on the split.
        entity.add("notes", value)

    sanction = make_sanction(context, entity)
    sanction.add("modifiedAt", max(dates))

    for justification in node.findall("./justification"):
        entity.add("notes", justification.text)

    ssid = target.get("sanctions-set-id")
    sanction.add("program", programs.get(ssid))

    for identity in node.findall("./identity"):
        parse_identity(context, entity, identity, places)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)
def crawl_entity(context, data):
    """Build and emit one entity from a registry record.

    Records whose ``Nature`` has no entry in ``SCHEMATA`` are logged and
    skipped, so ``context.make`` is never called with a ``None`` schema.

    Args:
        context: Crawler context used to make, identify, log and emit.
        data: Record dict; consumed keys are popped from it.
    """
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    if schema is None:
        context.log.error("Unknown entity type", nature=nature)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    entity.add("name", data.pop("Nom"))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)

    # NOTE(review): the sanction is populated via apply_prop but is not
    # emitted in this function -- confirm whether that is intentional.
    context.emit(entity, target=True)
def crawl(context: Context):
    """Emit a bank plus the sanctioned company or person for each JSON record.

    Every record yields a ``Company`` for the bank (emitted only when an id
    could be built) and a target entity: a ``Company`` when ``CompanyName``
    is set, otherwise a ``Person`` built from the first and last name.
    """
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        records = json.load(fh)

    for record in records:
        charter_no = record.pop("CharterNumber")
        bank_name = record.pop("BankName")

        bank = context.make("Company")
        bank.id = context.make_slug(charter_no, bank_name)
        bank.add("name", bank_name)
        bank.add("registrationNumber", charter_no)
        bank.add("country", "us")
        bank.add("topics", "fin.bank")
        if bank.id is not None:
            context.emit(bank)

        company_name = record.pop("CompanyName")
        first_name = record.pop("FirstName")
        last_name = record.pop("LastName")
        if not company_name:
            entity = context.make("Person")
            entity.id = context.make_id(charter_no, bank_name, first_name, last_name)
            h.apply_name(entity, first_name=first_name, last_name=last_name)
        else:
            entity = context.make("Company")
            entity.id = context.make_id(charter_no, bank_name, company_name)
            entity.add("name", company_name)

        entity.add("country", "us")
        entity.add("topics", "crime.fin")

        addr = h.make_address(
            context,
            city=record.pop("CityName"),
            state=record.pop("StateName"),
            country_code="us",
        )
        # Consumed but unused: the full state name is already in the address.
        record.pop("StateAbbreviation")
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        for prop, key in (
            ("startDate", "CompleteDate"),
            ("endDate", "TerminationDate"),
            ("program", "EnforcementTypeDescription"),
            ("authorityId", "DocketNumber"),
        ):
            sanction.add(prop, record.pop(key, None))

        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Parse the numbered persons in section "2." of the page.

    Only ``td.tailSTxt`` cells whose text starts with "2." are processed.
    Each table row inside provides a number and a text of the form
    ``"Name (detail) (detail) ..."``. The bracketed details are resolved via
    the ``props`` lookup or recognised as citizenship, date of birth or
    passport number; anything else is logged.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for node in doc.findall(".//td[@class='tailSTxt']"):
        if not node.text_content().startswith("2."):
            continue
        for item in node.findall(".//tr"):
            number = item.find(".//td[@class='sProvP1No']").text_content()
            text = item.findtext(".//td[@class='sProvP1']")
            text = text.strip().rstrip(";").rstrip(".")
            # partition() keeps the whole text as the name when there is no
            # bracketed part, where tuple-unpacking a split would raise.
            name = text.partition("(")[0]
            names = multi_split(name, ["s/o", "@"])

            entity = context.make("Person")
            entity.id = context.make_slug(number, name)
            entity.add("name", names)
            entity.add("topics", "sanction")

            sanction = h.make_sanction(context, entity)
            sanction.add("program", PROGRAM)

            for match in IN_BRACKETS.findall(text):
                res = context.lookup("props", match)
                if res is not None:
                    for prop, value in res.props.items():
                        entity.add(prop, value)
                    continue
                if match.endswith("citizen"):
                    entity.add("nationality", match.replace("citizen", ""))
                    continue
                if match.startswith(DOB):
                    dob = match.replace(DOB, "").strip()
                    entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"]))
                    continue
                if match.startswith(PASSPORT):
                    passport = match.replace(PASSPORT, "").strip()
                    entity.add("passportNumber", passport)
                    continue
                # warning() instead of the deprecated warn() alias.
                context.log.warning("Unparsed bracket term", term=match)

            context.emit(entity, target=True)
            context.emit(sanction)
def parse_entry(context: Context, node: _Element):
    """Build and emit one listed entity and its sanction from an XML node.

    A node with an ``Entity`` child becomes a ``LegalEntity``; otherwise a
    ``Person`` is built from the given and last name. The ``Country`` text
    is used as the sanction program, and its part before "/" as the
    entity's country.

    Args:
        context: Crawler context used to make, identify and emit.
        node: XML element for one list entry.
    """
    entity_name = node.findtext("./Entity")
    dob = node.findtext("./DateOfBirth")
    schedule = node.findtext("./Schedule")
    if schedule == "N/A":
        schedule = ""
    program = node.findtext("./Country")
    item = node.findtext("./Item")

    if entity_name is None:
        entity = context.make("Person")
        entity_name = h.make_name(
            given_name=node.findtext("./GivenName"),
            last_name=node.findtext("./LastName"),
        )
        entity.add("name", entity_name)
        entity.add("birthDate", dob)
    else:
        entity = context.make("LegalEntity")
        entity.add("name", entity_name.split("/"))

    country = program
    if program is not None and "/" in program:
        country, _ = program.split("/")
    entity.add("country", country)

    # The id uses the country as stored on the entity, not the raw text.
    entity.id = context.make_slug(
        schedule,
        item,
        entity.first("country"),
        entity_name,
        strict=False,
    )

    sanction = h.make_sanction(context, entity)
    sanction.add("program", program)
    sanction.add("reason", schedule)
    sanction.add("authorityId", item)

    aliases = node.findtext("./Aliases")
    if aliases is not None:
        for alias in aliases.split(", "):
            entity.add("alias", collapse_spaces(alias))

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def crawl(context: Context):
    """Parse the article's debarment table and emit one ``Company`` per row.

    The first row supplies the headers; its last-but-one column is replaced
    by two columns, "from" and "to". Rows that do not produce a
    ``prohibited_practice`` column are skipped.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find(".//article//table")
    headers = None
    for tr in table.findall(".//tr"):
        if headers is None:
            slugs = [slugify(td.text_content(), "_") for td in tr.findall("./td")]
            headers = slugs[:-2] + ["from", "to"] + slugs[-1:]
            continue

        values = [collapse_spaces(td.text_content()) for td in tr.findall("./td")]
        cells = dict(zip(headers, values))
        if "prohibited_practice" not in cells:
            continue

        name = cells.pop("firm_name")
        nationality = cells.pop("nationality")

        entity = context.make("Company")
        entity.id = context.make_id(name, nationality)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", nationality)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("prohibited_practice"))
        sanction.add("startDate", h.parse_date(cells.pop("from"), FORMATS))
        sanction.add("endDate", h.parse_date(cells.pop("to"), FORMATS))

        address = h.make_address(
            context,
            full=cells.pop("address"),
            country=nationality,
        )
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
def parse_common(context, entity, node):
    """Populate the shared fields of an entry and build its sanction.

    Neither the entity nor the sanction is emitted here.

    Args:
        context: Crawler context used to identify the entity.
        entity: Entity to populate.
        node: XML element for one list entry.

    Returns:
        The populated sanction.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    # Prefer the original-script name; fall back to the first name.
    name = node.findtext("./NAME_ORIGINAL_SCRIPT") or node.findtext("./FIRST_NAME")
    entity.add("name", name)
    entity.add("notes", node.findtext("./COMMENTS1"))
    entity.add("topics", "sanction")

    updated_at = values(node.find("./LAST_DAY_UPDATED"))
    if len(updated_at) > 0:
        entity.add("modifiedAt", updated_at)
        entity.context["updated_at"] = max(updated_at)

    listed_on = node.findtext("./LISTED_ON")
    if listed_on is not None:
        entity.context["created_at"] = listed_on

    sanction = make_sanction(context, entity)
    sanction.add("startDate", listed_on)
    sanction.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))
    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("recordId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
def crawl_individuals(context: Context):
    """Emit a ``Person`` and sanction for each record of the individuals sheet.

    Records without an ``internal_seq_id``, or for which no id can be built
    from the English, Hebrew and Arabic names, are skipped. Columns left
    over after processing are pretty-printed for inspection.
    """
    path = context.fetch_resource("individuals.xlsx", PEOPLE_URL)
    context.export_resource(path, XLSX, title=context.SOURCE_TITLE)

    for record in excel_records(path):
        seq_id = record.pop("internal_seq_id", None)
        if seq_id is None:
            continue

        name_en = record.pop("name_of_individual_english", None)
        name_he = record.pop("name_of_individual_hebrew", None)
        name_ar = record.pop("name_of_individual_arabic", None)

        entity = context.make("Person")
        entity.id = context.make_id(name_en, name_he, name_ar)
        if entity.id is None:
            continue

        # First non-empty of English, Hebrew, Arabic is the primary name.
        entity.add("name", name_en or name_he or name_ar)
        entity.add("alias", name_he)
        entity.add("alias", name_ar)
        entity.add("topics", "crime.terror")
        entity.add("birthDate", parse_date(record.pop("d_o_b", None)))
        entity.add("nationality", record.pop("nationality_residency", None))
        entity.add("idNumber", record.pop("individual_id", None))

        sanction = h.make_sanction(context, entity)
        sanction.add("recordId", seq_id)
        sanction.add("recordId", record.pop("foreign_designation_id", None))
        sanction.add("program", record.pop("designation", None))
        sanction.add("program", record.pop("foreign_designation", None))
        sanction.add("authority", lang_pick(record, "designated_by"))

        # Called for its effect on the record only; the result is not used.
        lang_pick(record, "designated_by_abroad")
        record.pop("date_of_foreign_designation_date", None)

        designated_on = record.pop("date_of_designation_in_israel", None)
        sanction.add("startDate", parse_date(designated_on))

        context.emit(entity, target=True)
        context.emit(sanction)

        if record:
            context.pprint(record)
def crawl(context: Context):
    """Parse the ``datatable-1`` table and emit one ``LegalEntity`` per row.

    The first row supplies the headers (slugified ``th`` text); each later
    row yields an entity identified by name and nationality plus a sanction
    with reason and date range.
    """
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    table = doc.find('.//table[@id="datatable-1"]')
    headers = None
    for tr in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(th.text, "_") for th in tr.findall("./th")]
            continue

        values = [collapse_spaces(td.text) for td in tr.findall("./td")]
        cells = dict(zip(headers, values))

        # AfDB lists several individuals as firms in places where the IADB
        # shows them to be people (and they have normal personal names), so
        # the source's type column is not used and every row is emitted as a
        # generic LegalEntity.
        name = cells.pop("name")
        country = cells.pop("nationality")

        entity = context.make("LegalEntity")
        entity.id = context.make_id(name, country)
        entity.add("name", name)
        entity.add("topics", "debarment")
        entity.add("country", country)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("basis"))
        sanction.add("startDate", parse_date(cells.pop("from")))
        sanction.add("endDate", parse_date(cells.pop("to")))

        context.emit(entity, target=True)
        context.emit(sanction)
def parse_common(context: Context, entity, node):
    """Populate the shared fields of an entry and build its sanction.

    Neither the entity nor the sanction is emitted here.

    Args:
        context: Crawler context used to identify the entity.
        entity: Entity to populate.
        node: XML element for one list entry.

    Returns:
        The populated sanction.
    """
    entity.id = context.make_slug(node.findtext("./DATAID"))

    name_parts = {
        "given_name": node.findtext("./FIRST_NAME"),
        "second_name": node.findtext("./SECOND_NAME"),
        "name3": node.findtext("./THIRD_NAME"),
        "name4": node.findtext("./FOURTH_NAME"),
    }
    h.apply_name(entity, quiet=True, **name_parts)

    entity.add("alias", node.findtext("./NAME_ORIGINAL_SCRIPT"))
    entity.add("notes", h.clean_note(node.findtext("./COMMENTS1")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)

    # The listing date doubles as creation, listing and start date.
    entity.add("createdAt", node.findtext("./LISTED_ON"))
    sanction.add("listingDate", node.findtext("./LISTED_ON"))
    sanction.add("startDate", node.findtext("./LISTED_ON"))

    sanction.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))
    entity.add("modifiedAt", values(node.find("./LAST_DAY_UPDATED")))

    sanction.add("program", node.findtext("./UN_LIST_TYPE"))
    sanction.add("unscId", node.findtext("./REFERENCE_NUMBER"))
    return sanction
def parse_sanctions(context: Context, entity: Entity, entry):
    """Emit one sanction per ``regulation`` child of the entry.

    Each sanction is keyed on the regulation's publication URL, which must
    be present (asserted). Entries without regulations emit nothing.

    Args:
        context: Crawler context used to emit sanctions.
        entity: Entity the sanctions refer to; gains ``createdAt`` values.
        entry: XML element for one list entry.
    """
    for regulation in entry.findall("./regulation"):
        url = regulation.findtext("./publicationUrl")
        assert url is not None, etree.tostring(regulation)

        published = regulation.get("publicationDate")

        sanction = h.make_sanction(context, entity, key=url)
        sanction.set("sourceUrl", url)
        sanction.add("program", regulation.get("programme"))
        sanction.add("reason", regulation.get("numberTitle"))
        sanction.add("startDate", regulation.get("entryIntoForceDate"))
        sanction.add("listingDate", published)
        entity.add("createdAt", published)
        # Identifiers come from the entry, not the individual regulation.
        sanction.add("unscId", entry.get("unitedNationId"))
        sanction.add("authorityId", entry.get("euReferenceNumber"))
        context.emit(sanction)
def crawl_entity(context: Context, data):
    """Build and emit one entity from a registry record.

    Records whose ``Nature`` has no entry in ``SCHEMATA`` are logged as an
    error and skipped. The record's ``Nom`` is applied as the tail of the
    name, combined with a first name set earlier by ``apply_prop`` (if any).

    Args:
        context: Crawler context used to make, identify, log and emit.
        data: Record dict; consumed keys are popped from it.
    """
    nature = data.pop("Nature")
    schema = SCHEMATA.get(nature)
    if schema is None:
        context.log.error("Unknown entity type", nature=nature)
        return

    entity = context.make(schema)
    entity.id = context.make_slug(data.pop("IdRegistre"))
    sanction = h.make_sanction(context, entity)

    for detail in data.pop("RegistreDetail"):
        field = detail.pop("TypeChamp")
        for value in detail.pop("Valeur"):
            apply_prop(context, entity, sanction, field, value)

    h.apply_name(
        entity,
        first_name=entity.first("firstName", quiet=True),
        tail_name=data.pop("Nom"),
        quiet=True,
    )
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
def crawl(context: Context):
    """Page through the JSON API and emit one debarred entity per row.

    Pages are requested by substituting the page number into the dataset
    URL. Paging stops once a page contains the row with id 1, or when a
    page comes back empty (which would otherwise fail in ``min``).
    """
    base_url = str(context.dataset.data.url)
    headers = {
        "Accept": "application/json",
        "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
    }
    for page in count(1):
        url = base_url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        res = context.http.get(url, headers=headers)

        ids = []
        for row in res.json():
            # Normalise the "N/A" placeholder to an empty value.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""

            row_id = row.pop("id")
            ids.append(row_id)

            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue

            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))

            # Companies carry a jurisdiction rather than a nationality.
            nat = "jurisdiction" if schema == "Company" else "nationality"
            entity.add(nat, parse_countries(row.pop("nationality")))

            affiliated = row.pop("affiliatedWithEntityId")
            if len(affiliated):
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)

            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))

            context.emit(sanction)
            context.emit(entity, target=True)

        # NOTE(review): stopping on id 1 presumably relies on the API
        # returning rows in descending id order -- confirm with the source.
        if not ids or min(ids) == 1:
            return