Example #1
def parse_party(context: Context, distinct_party, locations, documents):
    profile = distinct_party.find("Profile")
    sub_type = ref_get("PartySubType", profile.get("PartySubTypeID"))
    schema = TYPES.get(sub_type.get("Value"))
    type_ = ref_value("PartyType", sub_type.get("PartyTypeID"))
    schema = TYPES.get(type_, schema)
    if schema is None:
        context.log.error("Unknown party type", value=type_)
        return
    party = context.make(schema)
    party.id = context.make_slug(profile.get("ID"))
    party.add("notes", h.clean_note(distinct_party.findtext("Comment")))
    party.add("sourceUrl", URL % profile.get("ID"))

    for identity in profile.findall("./Identity"):
        parts = {}
        for group in identity.findall(".//NamePartGroup"):
            type_id = group.get("NamePartTypeID")
            parts[group.get("ID")] = ref_value("NamePartType", type_id)

        for alias in identity.findall("./Alias"):
            parse_alias(party, parts, alias)

        for regdoc in documents.get(identity.get("ID"), []):
            parse_registration_doc(context, party, regdoc)

    for feature in profile.findall("./Feature"):
        parse_feature(context, feature, party, locations)

    context.emit(party, target=True)
    # pprint(party.to_dict())
    # context.log.info("[%s] %s" % (party.schema.name, party.caption))
    return party
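
The parser above leans on two lookup helpers, ref_get and ref_value, defined elsewhere in the crawler. A minimal sketch, assuming the OFAC reference value sets have already been loaded into a nested dict keyed by type name and reference ID (the dict layout is an assumption):

from typing import Any, Dict, Optional

# Assumed container for the OFAC ReferenceValueSets, keyed by type name
# and reference ID.
REFERENCES: Dict[str, Dict[str, Dict[str, Any]]] = {}

def ref_get(type_: str, ref_id: str) -> Dict[str, Any]:
    # Look up a reference record (e.g. a PartySubType) by its ID.
    return REFERENCES[type_][ref_id]

def ref_value(type_: str, ref_id: str) -> Optional[str]:
    # Return only the human-readable value of a reference record.
    return ref_get(type_, ref_id).get("Value")
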
Example #2
def parse_entry(context: Context, entry, parties):
    party_id = context.make_slug(entry.get("ProfileID"))
    party = parties[party_id]

    sanction = h.make_sanction(context, party, key=entry.get("ID"))
    sanction.add("program", ref_value("List", entry.get("ListID")))

    for event in entry.findall("./EntryEvent"):
        date = parse_date(event.find("./Date"))
        party.add("createdAt", date)
        sanction.add("summary", event.findtext("./Comment"))
        basis = ref_value("LegalBasis", event.get("LegalBasisID"))
        sanction.add("reason", basis)

    party.add("topics", "sanction")
    sanction.add("listingDate", party.get("createdAt"))
    sanction.add("startDate", party.get("modifiedAt"))

    for measure in entry.findall("./SanctionsMeasure"):
        sanction.add("summary", measure.findtext("./Comment"))
        type_id = measure.get("SanctionsTypeID")
        sanction.add("program", ref_value("SanctionsType", type_id))

    context.emit(sanction)
    context.emit(party, target=True)
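
Note that parse_date here receives an XML element, not a string. A hedged sketch, assuming the <Date> node carries Year/Month/Day children (the child tag names are assumptions about the source schema):

from typing import Optional

def parse_date(el) -> Optional[str]:
    # Assemble an ISO-style date from Year/Month/Day child elements.
    if el is None:
        return None
    year = el.findtext("./Year")
    if year is None:
        return None
    parts = [year]
    for tag in ("./Month", "./Day"):
        value = el.findtext(tag)
        if value is None:
            break
        parts.append(value.zfill(2))
    return "-".join(parts)
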
Example #3
def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)
    for entry in doc.findall(".//sanctionEntity"):
        parse_entry(context, entry)
Example #4
def crawl_person(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "person")
    for row in data["data"]:
        row = clean_row(row)
        person_id = row.pop("person_id")
        name_en = row.pop("name_en", None)
        name_ru = row.pop("name_ru", None)
        name_uk = row.pop("name_uk", None)
        name = name_en or name_ru or name_uk
        entity = context.make("Person")
        entity.id = context.make_slug("person", person_id, name)
        entity.add("name", name)
        entity.add("alias", name_ru)
        entity.add("alias", name_uk)
        entity.add("birthDate", parse_date(row.pop("date_bd", None)))
        url = "https://sanctions.nazk.gov.ua/sanction-person/%s/"
        entity.add("sourceUrl", url % person_id)
        if row.get("city_bd_en") != "N/A":
            entity.add("birthPlace", row.pop("city_bd_en", None))
            entity.add("birthPlace", row.pop("city_bd_ru", None))
            entity.add("birthPlace", row.pop("city_bd_uk", None))
        entity.add("position", row.pop("position_en", None))
        entity.add("position", row.pop("position_ru", None))
        entity.add("position", row.pop("position_uk", None))
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))

        country = row.get("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        context.emit(entity, target=True)
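
This crawler's parse_date takes a plain string. A minimal sketch building on the h.parse_date helper seen in other examples; the format list is an assumption about the source data:

from typing import Optional

from opensanctions import helpers as h

# Assumed date formats; the source may use others.
FORMATS = ["%d.%m.%Y", "%Y-%m-%d"]

def parse_date(date: Optional[str]) -> Optional[str]:
    if date is None:
        return None
    return h.parse_date(date.strip(), FORMATS)
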
Example #5
def crawl_company(context: Context) -> None:
    data = json_resource(context, context.dataset.data.url, "company")
    for row in data["data"]:
        row = clean_row(row)
        # context.pprint(row)
        company_id = row.pop("company_id")
        name_en = row.pop("name_en", None)
        name = row.pop("name", None) or name_en
        entity = context.make("Organization")
        entity.id = context.make_slug("company", company_id, name)
        if entity.id is None:
            entity.id = context.make_slug(
                "company",
                company_id,
                row.pop("ogrn", None),
                strict=False,
            )
        entity.add("name", name)
        entity.add("name", name_en)
        entity.add("name", row.pop("name_uk", None))
        entity.add("name", row.pop("name_ru", None))
        entity.add("innCode", row.pop("inn", None))
        entity.add_cast("Company", "ogrnCode", row.pop("ogrn", None))

        country = row.pop("country", None)
        entity.add("country", COUNTRIES[country])
        entity.add("topics", "sanction")
        entity.add("notes", row.pop("reasoning_en", None))
        entity.add("notes", row.pop("reasoning_ru", None))
        entity.add("notes", row.pop("reasoning_uk", None))

        context.emit(entity, target=True)
        row.pop("logo_en", None)
Example #6
def crawl_common(context: Context, data: Dict[str, str], part: str,
                 schema: str):
    entity = context.make(schema)
    entity.id = context.make_slug(part, data.pop("DATAID"))
    entity.add("topics", "sanction")
    entity.add("notes", h.clean_note(data.pop("COMMENTS1")))
    entity.add("notes", h.clean_note(data.pop("NOTE", None)))
    entity.add("alias", data.pop("NAME_ORIGINAL_SCRIPT"))

    h.apply_name(
        entity,
        name1=data.pop("FIRST_NAME", None),
        name2=data.pop("SECOND_NAME", None),
        name3=data.pop("THIRD_NAME", None),
        name4=data.pop("FOURTH_NAME", None),
        quiet=True,
    )

    sanction = h.make_sanction(context, entity)
    submitted_on = parse_date(data.pop("SUBMITTED_ON", None))
    listed_on = parse_date(data.pop("LISTED_ON"))
    modified_at = parse_date(data.pop("LAST_DAY_UPDATED"))
    entity.add("createdAt", submitted_on or listed_on or modified_at)
    entity.add("modifiedAt", modified_at)

    sanction.add("listingDate", submitted_on or listed_on)
    sanction.add("startDate", listed_on)
    sanction.add("program", data.pop("UN_LIST_TYPE"))
    sanction.add("program", data.pop("LIST_TYPE"))
    sanction.add("unscId", data.pop("REFERENCE_NUMBER"))
    sanction.add("authority", data.pop("SUBMITTED_BY", None))
    context.emit(sanction)
    return entity
Example #7
def crawl_legislature(context: Context, country, legislature):
    lastmod_ = int(legislature.get("lastmod"))
    lastmod = datetime.utcfromtimestamp(lastmod_)

    url = legislature.get("popolo_url")
    # this isn't being updated, hence long interval:
    data = context.fetch_json(url, cache_days=30)

    persons: Dict[str, Optional[str]] = {}
    for person in data.pop("persons", []):
        pid = person.get("id")
        persons[pid] = parse_person(context, person, country, lastmod)

    organizations: Dict[str, Optional[str]] = {}
    for org in data.pop("organizations", []):
        org_id = org.pop("id", None)
        org_id = context.lookup_value("org_id", org_id, org_id)
        if org_id is None:
            continue

        name = org.pop("name", org.pop("sort_name", None))
        organizations[org_id] = name

    events = data.pop("events", [])
    events = {e.get("id"): e for e in events}

    for membership in data.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
Example #8
def crawl(context: Context):
    path = context.fetch_resource("source.json", context.dataset.data.url)
    context.export_resource(path, JSON, title=context.SOURCE_TITLE)
    with open(path, "r") as file:
        data = json.load(file)
        for result in data.get("results", []):
            parse_result(context, result)
Example #9
def parse_person(context: Context, data, country, lastmod):
    person_id = data.pop("id", None)
    person = context.make("Person")
    person.id = context.make_slug(person_id)
    person.add("nationality", country)
    name = data.get("name")
    if name is None or name.lower().strip() in ("unknown", ):
        return
    person.add("modifiedAt", lastmod.date())
    person.add("name", data.pop("name", None))
    person.add("alias", data.pop("sort_name", None))
    for other in data.pop("other_names", []):
        person.add("alias", other.get("name"))
    person.add("gender", data.pop("gender", None))
    person.add("title", data.pop("honorific_prefix", None))
    person.add("title", data.pop("honorific_suffix", None))
    person.add("firstName", data.pop("given_name", None))
    person.add("lastName", data.pop("family_name", None))
    person.add("fatherName", data.pop("patronymic_name", None))
    person.add("birthDate", data.pop("birth_date", None))
    person.add("deathDate", data.pop("death_date", None))
    person.add("email", h.clean_emails(data.pop("email", None)))
    person.add("notes", data.pop("summary", None))
    person.add("topics", "role.pep")

    for link in data.pop("links", []):
        url = link.get("url")
        if link.get("note") in ("website", "blog", "twitter", "facebook"):
            person.add("website", url)
        # elif "Wikipedia (" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # elif "wikipedia" in link.get("note") and "wikipedia.org" in url:
        #     person.add("wikipediaUrl", url)
        # else:
        #     person.log.info("Unknown URL", url=url, note=link.get("note"))

    for ident in data.pop("identifiers", []):
        identifier = ident.get("identifier")
        scheme = ident.get("scheme")
        if scheme == "wikidata" and identifier.startswith("Q"):
            person.add("wikidataId", identifier)

    for contact_detail in data.pop("contact_details", []):
        value = contact_detail.get("value")
        if "email" == contact_detail.get("type"):
            person.add("email", h.clean_emails(value))
        if "phone" == contact_detail.get("type"):
            person.add("phone", h.clean_phones(value))

    if check_person_cutoff(person):
        return

    # data.pop("image", None)
    # data.pop("images", None)
    # if len(data):
    #     pprint(data)
    context.emit(person, target=True)
    # entities[person_id] = person.id
    return person.id
Example #10
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="utf-8") as fh:
        doc = html.fromstring(fh.read())
    for table in doc.findall('.//div[@class="editor-content"]//table'):
        headers = None
        schema = None
        for row in table.findall(".//tr"):
            cells = [
                collapse_spaces(c.text_content()) for c in row.findall("./td")
            ]
            if headers is None:
                headers = [slugify(c, sep="_") for c in cells]
                continue
            if len(cells) == 1:
                schema = TYPES[cells[0]]
                continue
            row = dict(zip(headers, cells))

            entity = context.make(schema)
            name = row.pop("imie_i_nazwisko_nazwa_podmiotu")
            entity.id = context.make_slug(name)
            names = name.split("(")
            entity.add("name", names[0])
            for alias in names[1:]:
                entity.add("alias", alias.split(")")[0])
            notes = row.pop("uzasadnienie_wpisu_na_liste")
            entity.add("notes", notes)

            details = row.pop("dane_identyfikacyjne_osoby_podmiotu")
            for (chop, prop) in CHOPSKA:
                parts = details.rsplit(chop, 1)
                details = parts[0]
                if len(parts) > 1:
                    if prop == "address":
                        addr = h.make_address(context, full=parts[1])
                        h.apply_address(context, entity, addr)
                    else:
                        entity.add(prop, parts[1])
            if len(details.strip()):
                result = context.lookup("details", details)
                if result is None:
                    context.log.warning("Unhandled details", details=details)
                else:
                    for prop, value in result.props.items():
                        entity.add(prop, value)

            sanction = h.make_sanction(context, entity)
            provisions = row.pop("zastosowane_srodki_sankcyjne")
            sanction.add("provisions", provisions)

            start_date = row.pop("data_umieszczenia_na_liscie")
            start_date = start_date.replace(" r.", "")
            sanction.add("startDate", h.parse_date(start_date, ["%d.%m.%Y"]))

            h.audit_data(row)
            context.emit(entity, target=True)
            context.emit(sanction)
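
CHOPSKA is a module-level list of (marker, property) pairs that the loop above peels off the end of the details string, right to left. Illustrative values only; the actual marker strings used by the Polish list are not shown here:

# Hypothetical marker/property pairs; the real strings differ.
CHOPSKA = [
    ("NIP:", "taxNumber"),
    ("KRS:", "registrationNumber"),
    ("adres:", "address"),
]
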
Example #11
def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, XML, title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)
    doc = h.remove_namespace(doc)

    for el in doc.findall(".//FinancialSanctionsTarget"):
        parse_row(context, make_row(el))
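
make_row is defined elsewhere; a plausible sketch that flattens each FinancialSanctionsTarget element into a dict, mirroring the XML-to-dict loop in example #24:

from typing import Dict

def make_row(el) -> Dict[str, str]:
    # Map child tag names to their stripped, non-empty text content.
    row = {}
    for cell in el.getchildren():
        if cell.text is None:
            continue
        value = cell.text.strip()
        if len(value):
            row[cell.tag] = value
    return row
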
Example #12
def run_enrich(scope_name: str, external_name: str, threshold: float):
    scope = Dataset.require(scope_name)
    external = Dataset.require(external_name)
    ctx = Context(external)
    resolver = get_resolver()
    database = Database(scope, resolver, cached=False)
    loader = database.view(scope)
    ctx.enrich(resolver, loader, threshold=threshold)
    resolver.save()
Example #13
def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)

    for node in doc.findall(".//INDIVIDUAL"):
        parse_individual(context, node)

    for node in doc.findall(".//ENTITY"):
        parse_entity(context, node)
Example #14
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for sec_id, (section, schema) in SECTIONS.items():
        el = doc.find(".//div[@id='%s']" % sec_id)
        for item in el.findall(".//li"):
            text = item.text_content().strip()
            index, text = text.split(".", 1)
            text = text.strip()
            text = text.rstrip(";")
            entity = context.make(schema)
            entity.id = context.make_id(text)
            sanction = h.make_sanction(context, entity)
            sanction.add("program", section)
            sanction.add("recordId", index)
            if sec_id == "russianUL":
                parse_russian_orgs(context, entity, text)
            if sec_id == "russianFL":
                parse_russian_persons(context, entity, text)
            if sec_id == "foreignUL":
                parse_foreign_orgs(context, entity, text)
            if sec_id == "foreignFL":
                parse_foreign_persons(context, entity, text)

            if entity.has("name"):
                context.emit(entity, target=True)
                context.emit(sanction)
Example #15
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)

    for table in doc.findall("//table"):
        headers = table.findall("./thead/tr/td")
        headers = [el.text_content() for el in headers]
        assert "Vendor name" in headers, headers
        assert "From" in headers, headers
        for row in table.findall("./tbody/tr"):
            cells = [el.text_content() for el in row.findall("./td")]
            if len(cells[0]) == 0:
                continue
            entity = context.make("LegalEntity")
            entity.id = context.make_id(*cells)
            entity.add("name", cells[0])
            entity.add("country", cells[1])
            entity.add("topics", "crime.fraud")

            cc = entity.first("country")
            address = h.make_address(context, full=cells[2], country_code=cc)
            h.apply_address(context, entity, address)

            sanction = h.make_sanction(context, entity)
            sanction.add("reason", cells[3])
            sanction.add("program", cells[4])
            sanction.add("startDate", parse_date(cells[5]))
            sanction.add("endDate", parse_date(cells[6]))

            context.emit(sanction)
            context.emit(entity, target=True)
Example #16
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r") as fh:
        doc = html.parse(fh)
    table = doc.find('//div[@class="sanctioned-table"]/table')
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [slugify(el.text) for el in row.findall("./th")]
            continue
        cells = [collapse_spaces(el.text) for el in row.findall("./td")]
        data = {hdr: c for hdr, c in zip(headers, cells)}

        entity = context.make("Person")
        entity.id = context.make_id(data["id"], data["ad-soyad-ata-adi"])
        entity.add("name", data["ad-soyad-ata-adi"])
        entity.add("idNumber", data["id"])
        entity.add("birthDate", parse_date(data["dogum-tarixi"]))
        entity.add("country", "az")
        entity.add("topics", "sanction")

        addr = h.make_address(context, full=data["malumat"])
        h.apply_address(context, entity, addr)

        sanction = h.make_sanction(context, entity)
        context.emit(sanction)
        context.emit(entity, target=True)
Example #17
def parse_common(context: Context, node, entity):
    sanction = h.make_sanction(context, entity)
    sanction.add("reason", node.findtext("./BasicInclusion"))
    sanction.add("program", node.findtext("./CategoryPerson"))
    inclusion_date = h.parse_date(node.findtext("./DateInclusion"), FORMATS)
    sanction.add("startDate", inclusion_date)
    sanction.add("listingDate", inclusion_date)
    entity.add("createdAt", inclusion_date)
    entity.add("topics", "sanction")
    context.emit(entity, target=True)
    context.emit(sanction)
Example #18
def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    entity = context.make("Person")
    entity.id = qid
    entity.add("topics", "role.oligarch")
    context.emit(entity, target=True)
Example #19
def make_entity(context: Context, el, schema, entity_id):
    entity = context.make(schema, target=True)
    entity.id = entity_id
    entity.add("notes", h.clean_note(el.findtext("./note")))
    entity.add("topics", "sanction")

    sanction = h.make_sanction(context, entity)
    sanction.add("summary", el.findtext("./correction"))
    context.emit(sanction)

    return entity
Example #20
def salvage_entity(context: Context, entry):
    texts = [t.text for t in entry.findall("./remark")]
    assert len(texts) == 2, texts
    name, details = texts
    name = name.split("(", 1)[0]
    entity = context.make("LegalEntity")
    entity.id = context.make_id(name)
    entity.add("name", name)
    entity.add("notes", details)
    entity.add("topics", "sanction")
    parse_sanctions(context, entity, entry)
    context.emit(entity, target=True)
Example #21
def parse_entity(context: Context, node):
    entity = context.make("LegalEntity")
    sanction = parse_common(context, entity, node)

    for alias in node.findall("./ENTITY_ALIAS"):
        parse_alias(entity, alias)

    for addr in node.findall("./ENTITY_ADDRESS"):
        h.apply_address(context, entity, parse_address(context, addr))

    context.emit(entity, target=True)
    context.emit(sanction)
Example #22
def crawl(context: Context):
    url = crawl_index(context)
    if url is None:
        context.log.error("Could not locate XML file", url=context.dataset.url)
        return
    path = context.fetch_resource("source.xml", url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    xml = context.parse_resource_xml(path)

    for person in xml.findall(".//KyrgyzPhysicPerson"):
        parse_person(context, person)
    for legal in xml.findall(".//KyrgyzLegalPerson"):
        parse_legal(context, legal)
Example #23
def crawl(context: Context):
    path = context.fetch_resource("source.zip", context.dataset.data.url)
    context.export_resource(path,
                            "application/zip",
                            title=context.SOURCE_TITLE)
    with ZipFile(path, "r") as archive:
        for name in archive.namelist():
            if name.endswith(".xml"):
                with archive.open(name) as fh:
                    doc = etree.parse(fh)
                    doc = h.remove_namespace(doc)
                    for entry in doc.findall(".//sanctionEntity"):
                        parse_entry(context, entry)
Example #24
def crawl(context: Context):
    path = context.fetch_resource("source.xml", context.dataset.data.url)
    context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
    doc = context.parse_resource_xml(path)

    for row in doc.findall(".//Table"):
        data = {}
        for field in row.getchildren():
            value = field.text
            if value == "NA":
                continue
            data[field.tag] = value
        crawl_row(context, data)
Example #25
def crawl(context: Context):
    xls_url = fetch_xls_url(context)
    path = context.fetch_resource("source.xls", xls_url)
    context.export_resource(path, XLS, title=context.SOURCE_TITLE)

    xls = xlrd.open_workbook(path)
    for sheet in xls.sheets():
        headers = None
        row0 = [h.convert_excel_cell(xls, c) for c in sheet.row(0)]
        sections = [c for c in row0 if c is not None]
        section = collapse_spaces(" / ".join(sections))
        for r in range(1, sheet.nrows):
            row = [h.convert_excel_cell(xls, c) for c in sheet.row(r)]

            # after a header is found, read normal data:
            if headers is not None:
                data: Dict[str, List[str]] = {}
                for header, cell in zip(headers, row):
                    if header is None:
                        continue
                    values = []
                    if isinstance(cell, datetime):
                        cell = cell.date()
                    for value in multi_split(stringify(cell), SPLITS):
                        if value is None:
                            continue
                        if value == "不明":
                            continue
                        values.append(value)
                    data[header] = values
                emit_row(context, sheet.name, section, data)

            if not len(row) or row[0] is None:
                continue
            teaser = row[0].strip()
            # the first column of the common headers:
            if "告示日付" in teaser:
                if headers is not None:
                    context.log.error("Found double header?", row=row)
                # print("SHEET", sheet, row)
                headers = []
                for cell in row:
                    cell = collapse_spaces(cell)
                    header = context.lookup_value("columns", cell)
                    if header is None:
                        context.log.warning("Unknown column title",
                                            column=cell,
                                            sheet=sheet.name)
                    headers.append(header)
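
multi_split and stringify come from the project's utility modules. A behavioural sketch of multi_split, splitting a value on each delimiter in turn (the exact semantics are an assumption inferred from its use above):

from typing import Iterable, List, Optional

def multi_split(value: Optional[str], splitters: Iterable[str]) -> List[str]:
    # Split on each delimiter in turn and keep trimmed, non-empty parts.
    if value is None:
        return []
    fragments = [value]
    for splitter in splitters:
        out: List[str] = []
        for fragment in fragments:
            out.extend(fragment.split(splitter))
        fragments = out
    return [f.strip() for f in fragments if len(f.strip())]
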
Example #26
def crawl(context: Context):
    path = context.fetch_resource("source.html", context.dataset.data.url)
    context.export_resource(path, HTML, title=context.SOURCE_TITLE)
    with open(path, "r", encoding="ISO-8859-1") as fh:
        doc = html.parse(fh)

    table = doc.find("//div[@id='viewcontainer']/table")
    headers = None
    for row in table.findall(".//tr"):
        if headers is None:
            headers = [
                slugify(c.text_content(), "_") for c in row.findall("./th")
            ]
            continue
        cells = [
            collapse_spaces(c.text_content()) for c in row.findall("./td")
        ]
        cells = dict(zip(headers, cells))
        cells.pop(None, None)

        full_name = name = cells.pop("name")
        registration_number = None
        for splitter in REG_NRS:
            if splitter in name:
                name, registration_number = name.split(splitter, 1)
                registration_number = registration_number.replace(")", "")

        country = cells.pop("nationality")
        country = country.replace("Non ADB Member Country", "")
        country = country.replace("Rep. of", "")
        entity = context.make("LegalEntity")
        entity.id = context.make_id(full_name, country)
        entity.add("name", name)
        entity.add("alias", cells.pop("othername_logo"))
        entity.add("topics", "debarment")
        entity.add("country", country)
        entity.add("registrationNumber", registration_number)

        sanction = h.make_sanction(context, entity)
        sanction.add("reason", cells.pop("grounds"))
        sanction.add("program", cells.pop("sanction_type"))
        date_range = cells.pop("effect_date_lapse_date", "")
        if "|" in date_range:
            start_date, end_date = date_range.split("|")
            sanction.add("startDate", h.parse_date(start_date.strip(),
                                                   FORMATS))
            sanction.add("endDate", h.parse_date(end_date.strip(), FORMATS))

        address = h.make_address(context,
                                 full=cells.pop("address"),
                                 country=country)
        h.apply_address(context, entity, address)

        context.emit(entity, target=True)
        context.emit(sanction)
Example #27
def crawl_row(context: Context, row):
    entity = context.make("Person")
    tag = row.pop("Tag")
    name_en = row.pop("Name eng")
    dob = row.pop("DOB")
    entity.id = context.make_id(name_en, tag, dob)
    entity.add("name", name_en)
    entity.add("alias", row.get("Name cyrillic"))
    entity.add("birthDate", parse_date(dob))
    entity.add("notes", collapse_spaces(row.get("Description")))
    entity.add("position", tag.split("\n"))
    entity.add("gender", row.get("Gender"))

    context.emit(entity, target=True)
Example #28
def crawl_row(context: Context, row: Dict[str, str]):
    qid = row.get("qid", "").strip()
    if not len(qid):
        return
    if not is_qid(qid):
        context.log.warning("No valid QID", qid=qid)
        return
    schema = row.get("schema") or "Person"
    entity = context.make(schema)
    entity.id = qid
    topics = [t.strip() for t in row.get("topics", "").split(";")]
    topics = [t for t in topics if len(t)]
    entity.add("topics", topics)
    context.emit(entity, target=True)
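
A sketch of how crawl_row might be driven; the CSV source format and fetch details are assumptions:

import csv

def crawl(context: Context):
    path = context.fetch_resource("source.csv", context.dataset.data.url)
    with open(path, "r", encoding="utf-8") as fh:
        for row in csv.DictReader(fh):
            crawl_row(context, row)
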
Example #29
def export_dataset(dataset: Dataset, database: Database):
    """Dump the contents of the dataset to the output directory."""
    context = Context(dataset)
    context.bind()
    loader = database.view(dataset, export_assembler)
    exporters = [Exporter(context, loader) for Exporter in EXPORTERS]
    for entity in loader:
        for exporter in exporters:
            exporter.feed(entity)

    for exporter in exporters:
        exporter.finish()

    # Make sure the exported resources are visible in the database
    db.session.commit()

    # Export list of data issues from crawl stage
    issues_path = context.get_resource_path("issues.json")
    context.log.info("Writing dataset issues list", path=issues_path)
    with open(issues_path, "w", encoding=settings.ENCODING) as fh:
        data = {"issues": Issue.query(dataset=dataset).all()}
        write_json(data, fh)

    # Export full metadata
    index_path = context.get_resource_path("index.json")
    context.log.info("Writing dataset index", path=index_path)
    with open(index_path, "w", encoding=settings.ENCODING) as fh:
        meta = dataset.to_index()
        write_json(meta, fh)

    context.close()
Example #30
def parse_identity(context: Context, entity, node, places):
    for name in node.findall(".//name"):
        parse_name(entity, name)

    for address in node.findall(".//address"):
        place = places.get(address.get("place-id"))
        address = compose_address(context, entity, place, address)
        h.apply_address(context, entity, address)

    for bday in node.findall(".//day-month-year"):
        bval = parse_parts(bday.get("year"), bday.get("month"),
                           bday.get("day"))
        if entity.schema.is_a("Person"):
            entity.add("birthDate", bval)
        else:
            entity.add("incorporationDate", bval)

    for nationality in node.findall(".//nationality"):
        country = nationality.find("./country")
        if country is not None:
            entity.add("nationality", country.get("iso-code"))
            entity.add("nationality", country.text)

    for bplace in node.findall(".//place-of-birth"):
        place = places.get(bplace.get("place-id"))
        address = compose_address(context, entity, place, bplace)
        entity.add("birthPlace", address.get("full"))

    for doc in node.findall(".//identification-document"):
        country = doc.find("./issuer")
        type_ = doc.get("document-type")
        number = doc.findtext("./number")
        entity.add("nationality", country.text, quiet=True)
        schema = "Identification"
        if type_ in ("id-card"):
            entity.add("idNumber", number)
        if type_ in ("passport", "diplomatic-passport"):
            entity.add("idNumber", number)
            schema = "Passport"
        passport = context.make(schema)
        passport.id = context.make_id(entity.id, type_, doc.get("ssid"))
        passport.add("holder", entity)
        passport.add("country", country.text)
        passport.add("number", number)
        passport.add("type", type_)
        passport.add("summary", doc.findtext("./remark"))
        passport.add("startDate", doc.findtext("./date-of-issue"))
        passport.add("endDate", doc.findtext("./expiry-date"))
        context.emit(passport)
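
parse_parts assembles a possibly partial ISO date from separate year/month/day attributes; a minimal sketch under that assumption:

from typing import Optional

def parse_parts(year: Optional[str], month: Optional[str],
                day: Optional[str]) -> Optional[str]:
    # Emit as much of YYYY-MM-DD as the source actually provides.
    if year is None:
        return None
    date = year
    if month is not None:
        date = "%s-%s" % (date, month.zfill(2))
        if day is not None:
            date = "%s-%s" % (date, day.zfill(2))
    return date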