def parse_reference(context: Context, reference: int, rows):
    """Assemble a single entity (and its sanction) from all spreadsheet rows
    that share one reference number."""
    schemata = set()
    for row in rows:
        type_ = row.pop("type")
        schema = context.lookup_value("type", type_)
        if schema is None:
            context.log.warning("Unknown entity type", type=type_)
            return
        schemata.add(schema)
    assert len(schemata) == 1, schemata
    entity = context.make(schemata.pop())

    primary_name = None
    for row in rows:
        name = row.pop("name_of_individual_or_entity", None)
        name_type = row.pop("name_type")
        name_prop = context.lookup_value("name_type", name_type)
        if name_prop is None:
            context.log.warning("Unknown name type", name_type=name_type)
            return
        entity.add(name_prop, name)
        if name_prop == "name":
            primary_name = name

    entity.id = context.make_slug(reference, primary_name)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        addr = row.pop("address")
        if addr is not None:
            for part in multi_split(addr, SPLITS):
                address = h.make_address(context, full=part)
                h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", h.clean_note(row.pop("additional_information")))
        listing_info = row.pop("listing_information")
        if isinstance(listing_info, datetime):
            entity.add("createdAt", listing_info)
            sanction.add("listingDate", listing_info)
        else:
            sanction.add("summary", listing_info)
            # TODO: consider parsing if it's not a datetime?
        control_date = row.pop("control_date")
        sanction.add("startDate", control_date)
        entity.add("createdAt", control_date)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def parse_date(date):
    if date is not None:
        date = date.replace(" ", "")
    dates = set()
    for part in multi_split(date, [",", "\n", ";"]):
        dates.update(h.parse_date(part, ["%d.%m.%Y", "dd.%m.%Y"]))
    return dates
def parse_reference(context, reference, rows):
    """Build a LegalEntity (or Person) and its sanction from the rows for one
    reference number."""
    entity = context.make("LegalEntity")
    entity.id = context.make_slug(reference)
    # entity.add("sourceUrl", context.dataset.url)
    sanction = h.make_sanction(context, entity)

    for row in rows:
        if row.pop("type") == "Individual":
            entity.schema = model.get("Person")

        name = row.pop("name_of_individual_or_entity", None)
        if row.pop("name_type") == "aka":
            entity.add("alias", name)
        else:
            entity.add("name", name)

        address = h.make_address(context, full=row.pop("address"))
        h.apply_address(context, entity, address)
        sanction.add("program", row.pop("committees"))
        citizen = multi_split(row.pop("citizenship"), ["a)", "b)", "c)", "d)"])
        entity.add("nationality", citizen, quiet=True)
        dates = clean_date(row.pop("date_of_birth"))
        entity.add("birthDate", dates, quiet=True)
        entity.add("birthPlace", row.pop("place_of_birth"), quiet=True)
        entity.add("notes", row.pop("additional_information"))
        entity.add("notes", row.pop("listing_information"), quiet=True)
        control_date = row.pop("control_date")
        sanction.add("modifiedAt", control_date)
        entity.add("modifiedAt", control_date)
        entity.context["updated_at"] = control_date.isoformat()

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def clean_date(date):
    """Normalise a date-of-birth cell into a set of parsed date strings."""
    splits = [
        "a)",
        "b)",
        "c)",
        "d)",
        "e)",
        "f)",
        "g)",
        "h)",
        "i)",
        " or ",
        " to ",
        " and ",
        "alt DOB:",
        "alt DOB",
        ";",
        ">>",
    ]
    dates = set()
    if isinstance(date, float):
        date = str(int(date))
    if isinstance(date, datetime):
        date = date.date().isoformat()
    date = remove_bracketed(date)
    if date is None:
        return dates
    date = date.replace("\n", " ")
    for part in multi_split(date, splits):
        part = part.strip().strip(",")
        if not len(part):
            continue
        dates.update(h.parse_date(part, FORMATS))
    return dates
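# Illustrative usage sketch (not part of the source): assuming FORMATS includes
# "%d %b %Y" and h.parse_date() yields ISO-8601 strings, a multi-valued cell is
# split on the markers above and each fragment is parsed on its own, e.g.:
#   clean_date("1 Jan 1970 alt DOB: 2 Feb 1971")  ->  {"1970-01-01", "1971-02-02"}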
def parse_date(text: List[Optional[str]]) -> List[str]:
    dates: List[str] = []
    for date in multi_split(text, DATE_SPLITS):
        cleaned = DATE_CLEAN.sub("", date)
        normal = decompose_nfkd(cleaned)
        for parsed in h.parse_date(normal, FORMATS, default=date):
            dates.append(parsed)
    return dates
def crawl_physical(context: Context) -> None:
    """Emit Person entities from the "physical" JSON resource."""
    data = json_resource(context, PHYSICAL_URL, "physical")
    for row in data:
        entity = context.make("Person")
        entity.id = context.make_slug(row.pop("ukaz_id"), row.pop("index"))
        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        for alias in multi_split(row.pop("name_alternative", None), [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        for country in multi_split(row.pop("citizenship", None), [", "]):
            entity.add("nationality", country)
        entity.add("birthDate", row.pop("birthdate", None))
        entity.add("birthPlace", row.pop("birthplace", None))
        entity.add("position", row.pop("occupation", None))
        handle_address(context, entity, row.pop("livingplace", None))
        handle_sanction(context, entity, row)
        context.emit(entity, target=True)
def crawl(context: Context):
    """Read each sheet of the source XLS, detect header rows and emit the data
    rows that follow them."""
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)
    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":
                            continue
                        if value is not None:
                            values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning(
                            "Unknown column title", column=cell, sheet=sheet.name
                        )
                    headers.append(header)
def crawl(context: Context): path = context.fetch_resource("source.html", context.dataset.data.url) context.export_resource(path, HTML, title=context.SOURCE_TITLE) with open(path, "r") as fh: doc = html.parse(fh) for node in doc.findall(".//td[@class='tailSTxt']"): if not node.text_content().startswith("2."): continue for item in node.findall(".//tr"): number = item.find(".//td[@class='sProvP1No']").text_content() text = item.findtext(".//td[@class='sProvP1']") text = text.strip().rstrip(";").rstrip(".") name, _ = text.split("(", 1) names = multi_split(name, ["s/o", "@"]) entity = context.make("Person") entity.id = context.make_slug(number, name) entity.add("name", names) entity.add("topics", "sanction") sanction = h.make_sanction(context, entity) sanction.add("program", PROGRAM) for match in IN_BRACKETS.findall(text): # match = match.replace("\xa0", "") res = context.lookup("props", match) if res is not None: for prop, value in res.props.items(): entity.add(prop, value) continue if match.endswith("citizen"): nat = match.replace("citizen", "") entity.add("nationality", nat) continue if match.startswith(DOB): dob = match.replace(DOB, "").strip() entity.add("birthDate", h.parse_date(dob, ["%d %B %Y"])) continue if match.startswith(PASSPORT): passport = match.replace(PASSPORT, "").strip() entity.add("passportNumber", passport) continue context.log.warn("Unparsed bracket term", term=match) context.emit(entity, target=True) context.emit(sanction)
def crawl_legal(context: Context) -> None:
    """Emit Organization entities from the "legal" JSON resource."""
    data = json_resource(context, LEGAL_URL, "legal")
    for row in data:
        entity = context.make("Organization")
        entity.id = context.make_slug(row.pop("ukaz_id"), row.pop("index"))
        entity.add("name", row.pop("name_ukr", None))
        entity.add("name", row.pop("name_original", None))
        for alias in multi_split(row.pop("name_alternative", None), [";", "/"]):
            entity.add("alias", alias)
        entity.add("notes", row.pop("additional", None))
        ipn = row.pop("ipn", "") or ""
        entity.add("taxNumber", ipn.replace("ІПН", ""))
        odrn = row.pop("odrn_edrpou", "") or ""
        entity.add("registrationNumber", odrn.replace("ОДРН", ""))
        handle_address(context, entity, row.pop("place", None))
        handle_address(context, entity, row.pop("place_alternative", None))
        handle_sanction(context, entity, row)
        # context.pprint(row)
        context.emit(entity, target=True)
def parse_date(date):
    dates = []
    for part in multi_split(date, ["OR", ";", " - "]):
        dates.extend(h.parse_date(part, FORMATS))
    return dates
def parse_countries(text):
    countries = set()
    for country in multi_split(text, COUNTRY_SPLIT):
        country = remove_bracketed(country)
        countries.add(country)
    return countries
def split_new(text):
    # It's 2022 and they can't multi-value a thing...
    return multi_split(text, [". ", ", "])
def clean_phones(phones):
    out = []
    for phone in multi_split(phones, SPLITS):
        phone = REMOVE.sub("", phone)
        out.append(phone)
    return out
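# Illustrative usage sketch (not part of the source): SPLITS and REMOVE are
# module-level constants; the values below are assumptions for demonstration only.
#   SPLITS = [";", ","]
#   REMOVE = re.compile(r"[^\d+]")
#   clean_phones("+380 44 123-45-67; +380 (50) 765 43 21")
#     -> ["+380441234567", "+380507654321"]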
def split_names(names):
    return multi_split(names, ["\n", ", "])
def parse_entry(context: Context, entry):
    """Parse one XML entry into an entity, its passports and a sanction."""
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entry_id = entry.findtext("number-entry")
    entity.id = context.make_slug(entry_id)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    date_entry = entry.findtext("./date-entry")
    if date_entry:
        date = datetime.strptime(date_entry, "%Y%m%d")
        entity.add("createdAt", date.date())
        sanction.add("listingDate", date.date())
        sanction.add("startDate", date.date())

    for aka in entry.findall("./aka-list"):
        h.apply_name(
            entity,
            name1=aka.findtext("./aka-name1"),
            name2=aka.findtext("./aka-name2"),
            name3=aka.findtext("./aka-name3"),
            tail_name=aka.findtext("./aka-name4"),
            alias=aka.findtext("type-aka") != "N",
            is_weak=aka.findtext("./quality-aka") == "2",
            quiet=True,
        )

    for node in entry.findall("./title-list"):
        entity.add("title", node.text, quiet=True)

    for doc in entry.findall("./document-list"):
        reg = doc.findtext("./document-reg")
        number = doc.findtext("./document-id")
        country = doc.findtext("./document-country")
        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for doc in entry.findall("./id-number-list"):
        entity.add("idNumber", doc.text)

    for node in entry.findall("./address-list"):
        address = h.make_address(context, full=node.findtext("./address"))
        h.apply_address(context, entity, address)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        date = parse_date(dob.text)
        entity.add_cast("Person", "birthDate", date)

    for nat in entry.findall("./nationality-list"):
        for country in multi_split(nat.text, [";", ","]):
            country = remove_bracketed(country)
            entity.add("nationality", country, quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
def letter_split(text):
    return multi_split(text, SPLITS)
def parse_entry(context, entry):
    """Parse one XML entry into an entity, its passports and a sanction,
    assembling names from the individual aka-name parts."""
    entity = context.make("LegalEntity")
    if entry.findtext("./type-entry") == "2":
        entity = context.make("Person")
    entry_id = entry.findtext("number-entry")
    entity.id = context.make_slug(entry_id)

    sanction = h.make_sanction(context, entity)
    sanction.add("program", entry.findtext("./program-entry"))
    date_entry = entry.findtext("./date-entry")
    if date_entry:
        date = datetime.strptime(date_entry, "%Y%m%d")
        entity.context["created_at"] = date.isoformat()
        sanction.add("startDate", date.date())

    for aka in entry.findall("./aka-list"):
        first_name = aka.findtext("./aka-name1")
        entity.add("firstName", first_name, quiet=True)
        second_name = aka.findtext("./aka-name2")
        entity.add("secondName", second_name, quiet=True)
        third_name = aka.findtext("./aka-name3")
        entity.add("middleName", third_name, quiet=True)
        last_name = aka.findtext("./aka-name4")
        entity.add("lastName", last_name, quiet=True)
        name = jointext(first_name, second_name, third_name, last_name)
        if aka.findtext("type-aka") == "N":
            entity.add("name", name)
        else:
            if aka.findtext("./quality-aka") == "2":
                entity.add("weakAlias", name)
            else:
                entity.add("alias", name)

    for node in entry.findall("./title-list"):
        entity.add("title", node.text, quiet=True)

    for doc in entry.findall("./document-list"):
        reg = doc.findtext("./document-reg")
        number = doc.findtext("./document-id")
        country = doc.findtext("./document-country")
        passport = context.make("Passport")
        passport.id = context.make_id("Passport", entity.id, reg, number, country)
        passport.add("holder", entity)
        passport.add("passportNumber", number)
        passport.add("summary", reg)
        passport.add("country", country)
        context.emit(passport)

    for doc in entry.findall("./id-number-list"):
        entity.add("idNumber", doc.text)

    for node in entry.findall("./address-list"):
        address = h.make_address(context, full=node.findtext("./address"))
        h.apply_address(context, entity, address)

    for pob in entry.findall("./place-of-birth-list"):
        entity.add_cast("Person", "birthPlace", pob.text)

    for dob in entry.findall("./date-of-birth-list"):
        date = parse_date(dob.text)
        entity.add_cast("Person", "birthDate", date)

    for nat in entry.findall("./nationality-list"):
        for country in multi_split(nat.text, [";", ","]):
            country = remove_bracketed(country)
            entity.add("nationality", country, quiet=True)

    entity.add("topics", "sanction")
    context.emit(entity, target=True, unique=True)
    context.emit(sanction)