def crawl_legislature(context: Context, country, legislature):
    """Crawl one legislature's Popolo JSON file and emit its people and memberships.

    Processes the file in three passes: persons first, then organizations,
    then memberships (which reference both earlier maps plus events).
    """
    # Unix-epoch seconds of the dataset's last modification, forwarded to
    # parse_person (presumably as a "seen at" timestamp — confirm in parse_person).
    lastmod_ = int(legislature.get("lastmod"))
    # NOTE(review): naive UTC datetime; datetime.utcfromtimestamp is deprecated
    # in Python 3.12+ in favor of a timezone-aware fromtimestamp(..., tz=UTC).
    lastmod = datetime.utcfromtimestamp(lastmod_)
    url = legislature.get("popolo_url")
    # this isn't being updated, hence long interval:
    data = context.fetch_json(url, cache_days=30)
    # Map of source person id -> whatever parse_person returns (entity id or None).
    persons: Dict[str, Optional[str]] = {}
    for person in data.pop("persons", []):
        pid = person.get("id")
        persons[pid] = parse_person(context, person, country, lastmod)
    # Map of (possibly remapped) organization id -> display name.
    organizations: Dict[str, Optional[str]] = {}
    for org in data.pop("organizations", []):
        org_id = org.pop("id", None)
        # Allow the dataset config to remap org ids; fall back to the raw id.
        org_id = context.lookup_value("org_id", org_id, org_id)
        if org_id is None:
            continue
        # Prefer the display name, fall back to the sort name.
        name = org.pop("name", org.pop("sort_name", None))
        organizations[org_id] = name
    # Index events by id so memberships can resolve their event references.
    events = data.pop("events", [])
    events = {e.get("id"): e for e in events}
    for membership in data.pop("memberships", []):
        parse_membership(context, membership, persons, organizations, events)
def crawl(context: Context):
    """Fetch the governments index and crawl each listed country."""
    # Cache-busting query parameter; also forwarded to each country request.
    params = {"_": settings.RUN_DATE}
    response = context.fetch_json(context.dataset.data.url, params=params)
    payload = response.get("result", {}).get("data", {})
    edges = payload.get("governments", {}).get("edges", [])
    for edge in edges:
        node = edge.get("node", {})
        path, title = node.get("path"), node.get("title")
        crawl_country(context, params, path, title)
def crawl(context: Context):
    """Crawl the World Bank debarred-suppliers API and emit entities + sanctions.

    Emits one LegalEntity (with address) and one Sanction per supplier row.
    """
    url = context.dataset.data.url
    headers = {"apikey": context.dataset.data.api_key}
    data = context.fetch_json(url, headers=headers)
    # TODO write this out to a source.json
    # FIX: the original iterated `for data in data[...]`, shadowing the response
    # object (and reused `name` as the alias loop variable); use distinct names.
    for row in data["response"]["ZPROCSUPP"]:
        entity = context.make("LegalEntity")
        name = row.get("SUPP_NAME")
        ent_id = row.get("SUPP_ID")
        entity.id = context.make_slug(ent_id)
        # clean_name splits a raw name into a primary name plus aliases;
        # assumes it returns at least one entry — TODO confirm for empty names.
        names = clean_name(name)
        entity.add("name", names[0])
        entity.add("topics", "debarment")
        entity.add("country", row.get("COUNTRY_NAME"))
        for alias in names[1:]:
            entity.add("alias", alias)
        address = h.make_address(
            context,
            street=row.get("SUPP_ADDR"),
            city=row.get("SUPP_CITY"),
            country=row.get("COUNTRY_NAME"),
            key=entity.id,
        )
        h.apply_address(context, entity, address)
        sanction = h.make_sanction(context, entity)
        sanction.add("program", row.get("DEBAR_REASON"))
        sanction.add("startDate", h.parse_date(row.get("DEBAR_FROM_DATE"), FORMATS))
        sanction.add("endDate", h.parse_date(row.get("DEBAR_TO_DATE"), FORMATS))
        context.emit(entity, target=True)
        context.emit(sanction)
def crawl(context: Context):
    """Crawl the country index and delegate each legislature to crawl_legislature.

    The country code is loop-invariant, so it is computed and logged once per
    country instead of once per legislature (the original recomputed and
    re-logged it inside the inner loop).
    """
    data = context.fetch_json(context.dataset.data.url)
    for country in data:
        code = country.get("code")
        if code is None:
            # FIX: the original crashed with AttributeError on a missing code;
            # warn and skip instead.
            context.log.warning("Country has no code", country=country)
            continue
        code = code.lower()
        context.log.info("Country: %s" % code)
        for legislature in country.get("legislatures", []):
            crawl_legislature(context, code, legislature)
def http_get(context: Context, url, params=None, cache_days=None):
    """Fetch JSON from *url* using the shared AUTH credentials.

    Raises AbortCrawl on HTTP 429/416; logs any other HTTPError and
    returns None.
    """
    try:
        response = context.fetch_json(
            url, params=params, auth=AUTH, cache_days=cache_days
        )
    except HTTPError as err:
        status = err.response.status_code
        if status in (429, 416):
            # Rate-limited or range exhausted: stop the whole crawl.
            raise AbortCrawl()
        context.log.info("HTTP error: %r", err)
        return None
    return response
def crawl(context: Context):
    """Fetch the travel-bans file index and parse every XML file it lists."""
    data = context.fetch_json(context.dataset.data.url)
    # FIX: default to [] — the original iterated .get(...) with no default,
    # raising TypeError when "travelBansFiles" is absent.
    for ban in data.get("data", {}).get("travelBansFiles", []):
        # FIX: guard against a missing fileName before calling .endswith.
        file_name = ban.get("fileName") or ""
        if not file_name.endswith(".xml"):
            continue
        data_url = URL % ban.get("id")
        path = context.fetch_resource("source.xml", data_url)
        context.export_resource(path, "text/xml", title=context.SOURCE_TITLE)
        doc = context.parse_resource_xml(path)
        doc = h.remove_namespace(doc)
        for entry in doc.findall(".//sanctionEntity"):
            subject_type = entry.find("./subjectType")
            if subject_type is None:
                # Entries without a subjectType are malformed; try to salvage.
                salvage_entity(context, entry)
                continue
            parse_entry(context, entry)
def crawl_country(context: Context, country, age_max=120, age_min=0):
    """Crawl notices for one nationality, bisecting the age range when needed.

    The API caps a query at MAX_RESULTS rows; when the reported total exceeds
    that, the age range is split in half and each half is crawled recursively
    until every bucket fits in one page.
    """
    params = {
        "ageMin": int(age_min),
        "ageMax": int(age_max),
        # "arrestWarrantCountryId" would filter by issuing country instead:
        "nationality": country,
        "resultPerPage": MAX_RESULTS,
    }
    try:
        data = context.fetch_json(context.dataset.data.url, params=params)
    except HTTPError as err:
        context.log.warning(
            "HTTP error",
            url=str(err.request.url),
            country=country,
            error=err.response.status_code,
        )
        return
    notices = data.get("_embedded", {}).get("notices", [])
    for notice in notices:
        crawl_notice(context, notice)
    # FIX: default to 0 — a missing "total" made the comparison below raise
    # TypeError (None > int).
    total = data.get("total", 0)
    if total > MAX_RESULTS:
        # Too many hits for one page: bisect the age range and recurse.
        age_range = age_max - age_min
        if age_range > 1:
            age_split = age_min + (age_range // 2)
            crawl_country(context, country, age_max, age_split)
            crawl_country(context, country, age_split, age_min)
        elif age_range == 1:
            # Range of one year: query each endpoint age individually.
            crawl_country(context, country, age_max, age_max)
            crawl_country(context, country, age_min, age_min)
def crawl_country(context: Context, params, path, country):
    """Crawl one country page and emit a PEP Person per listed office-holder.

    The page body is an HTML fragment where <h3> headings name the function
    held by the people in the following elements.
    """
    source_url = UI_URL % path
    context.log.debug("Crawling country: %s" % country)
    res = context.fetch_json(DATA_URL % path, params=params)
    data = res.get("result", {}).get("data", {}).get("page", {})
    blocks = data.get("acf", {}).get("blocks", [{}])[0]
    # FIX: the fallback must be a dict — the original's `[]` default has no
    # .get and raised AttributeError when "free_form_content" was missing.
    content = blocks.get("free_form_content", {}).get("content")
    doc = html.fromstring(content)
    function = None
    for i, el in enumerate(doc.getchildren()):
        text = el.text_content().strip()
        if el.tag == "h2":
            continue
        if el.tag == "h3":
            # Section heading: the function held by the names that follow.
            function = text
            continue
        if i == 0 and el.tag == "p":
            # this paragraph at the start is a note, not a person
            continue
        name = text.replace("(Acting)", "")
        if is_empty(name):
            continue
        context.log.debug(
            "Person",
            country=country,
            name=name,
            function=function,
            url=source_url,
        )
        person = context.make("Person")
        person.id = context.make_slug(country, name, function)
        person.add("name", name)
        person.add("country", country)
        person.add("position", function)
        person.add("sourceUrl", source_url)
        person.add("topics", "role.pep")
        context.emit(person, target=True)
def crawl(context: Context):
    """Page through the sanctions API until the lowest row id reached is 1.

    Row ids appear to descend across pages, so reaching id 1 means the last
    page has been consumed; an empty page also terminates the crawl.
    """
    for page in count(1):
        url = str(context.dataset.data.url)
        url = url.replace("pPageNumber=1", "pPageNumber=%s" % page)
        headers = {
            "Accept": "application/json",
            "Referer": "https://www.iadb.org/en/transparency/sanctioned-firms-and-individuals",
        }
        page_data = context.fetch_json(url, headers=headers)
        ids = []
        for row in page_data:
            # Normalise the API's "N/A" placeholders to empty strings.
            for field, value in list(row.items()):
                if value == "N/A":
                    row[field] = ""
            row_id = row.pop("id")
            ids.append(row_id)
            entity_type = row.pop("entity")
            schema = context.lookup_value("types", entity_type)
            if schema is None:
                context.log.warning("Unknown entity type", entity=entity_type)
                continue
            entity = context.make(schema)
            entity.id = context.make_slug(row_id)
            entity.add("name", row.pop("firmName"))
            entity.add("topics", "debarment")
            entity.add("alias", row.pop("additionalName"))
            entity.add("notes", row.pop("title"))
            entity.add("notes", row.pop("additionalTitle"))
            entity.add("country", parse_countries(row.pop("country")))
            # Companies carry a jurisdiction; people a nationality.
            nat = "nationality"
            if schema == "Company":
                nat = "jurisdiction"
            entity.add(nat, parse_countries(row.pop("nationality")))
            affiliated = row.pop("affiliatedWithEntityId")
            if len(affiliated):
                link = context.make("UnknownLink")
                link.id = context.make_id(row_id, affiliated)
                link.add("subject", entity.id)
                link.add("object", context.make_slug(affiliated))
                context.emit(link)
            sanction = h.make_sanction(context, entity)
            sanction.add("status", row.pop("statusName"))
            sanction.add("reason", row.pop("grounds"))
            sanction.add("authority", row.pop("source"))
            sanction.add("authority", row.pop("idBinstSource"))
            sanction.add("program", row.pop("idBinstType"))
            sanction.add("startDate", h.parse_date(row.pop("datefrom"), FORMATS))
            sanction.add("endDate", h.parse_date(row.pop("dateto"), FORMATS))
            context.emit(sanction)
            context.emit(entity, target=True)
        # FIX: an empty page made min([]) raise ValueError; terminate then too.
        if not ids or min(ids) == 1:
            return