def scrape_person(context, doc, url):
    """Scrape a single EU WhoisWho person page and emit the entity."""
    crumbs = doc.find(
        './/span[@itemtype="http://data-vocabulary.org/Breadcrumb"]')
    # Keep only breadcrumb items that have non-blank text.
    crumbs = [
        crumb.text_content()
        for crumb in crumbs
        if crumb.text_content() and crumb.text_content().strip()
    ]
    # Drop the leading 'institution' crumb and the trailing person name.
    crumbs = crumbs[1:-1]
    name = doc.find('.//h3[@itemprop="name"]').text_content()
    title = doc.findtext('.//td[@itemprop="jobTitle"]')
    entity = Entity.create('eu-whoiswho', make_id(name, title))
    entity.name = name
    entity.url = url
    entity.function = title
    address = entity.create_address()
    address.street = doc.findtext('.//span[@itemprop="streetAddress"]')
    address.postal_code = doc.findtext('.//span[@itemprop="postalCode"]')
    address.text = doc.findtext('.//span[@itemprop="addressLocality"]')
    # address.phone = doc.findtext('.//span[@itemprop="telephone"]')
    if len(crumbs) > 1:
        entity.program = crumbs[1]
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def handle_organisation(context, data):
    """Emit an entity for one organisation row of the KG FIU national list."""
    columns = ["No", "Name", "Reason for inclusion",
               "Category of entity", "Date of inclusion"]
    row = dict(zip(columns, data))
    entity = Entity.create("kg-fiu-national",
                           make_id(row["Name"], row["Reason for inclusion"]))
    entity.type = entity.TYPE_ENTITY
    # A comma-separated name cell holds the primary name followed by aliases.
    names = row["Name"].split(",")
    entity.name = names[0]
    for alias in names[1:]:
        entity.create_alias(alias)
    entity.program = row["Category of entity"]
    entity.summary = row["Reason for inclusion"]
    entity.listed_at = row["Date of inclusion"]
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def handle_individual(context, data):
    """Emit an entity for one individual row of the KG FIU national list.

    Fix: the place of birth was previously attached to a second birth
    *date* record (``create_birth_date``); it now goes on a proper birth
    place record via ``create_birth_place``, matching the other scrapers.
    """
    header = ["No", "Last Name", "Name", "Middle Name", "Date of birth",
              "Place of birth", "Reason for inclusion",
              "Category of entity", "Date of inclusion"]
    data = {key: value for key, value in zip(header, data)}
    entity_id = make_id(data["Last Name"], data["Middle Name"],
                        data["Name"], data["Reason for inclusion"])
    entity = Entity.create("kg-fiu-national", entity_id)
    entity.type = entity.TYPE_INDIVIDUAL
    entity.last_name = data["Last Name"]
    entity.first_name = data["Name"]
    entity.second_name = data["Middle Name"]
    birth_date = entity.create_birth_date()
    birth_date.date = data["Date of birth"]
    # FIX: was entity.create_birth_date() — the place belongs on a
    # birth place record, not a second (empty) birth date.
    birth_place = entity.create_birth_place()
    birth_place.place = data["Place of birth"]
    entity.program = data["Category of entity"]
    entity.summary = data["Reason for inclusion"]
    entity.listed_at = data["Date of inclusion"]
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def load_features(context, data, seen, extent):
    """Yield de-duplicated feature attribute dicts for *extent*,
    recursing into sub-envelopes when the server truncates the result."""
    params = QUERY.copy()
    if data["token"] is not None:
        params["token"] = data["token"]
    params["geometry"] = json.dumps(extent)
    url = "%s/%s/query" % (data["rest_url"], data["id"])
    # TODO: For some weird reason, using context.http returns 403 errors.
    # Things I have tried: reseting the session, setting stealth to true so
    # that the User-Agent is randomized, getting rid of all request headers.
    # None of it helps. So using requests instead because it works.
    payload = requests.get(url, params=params).json()
    for feature in payload.get("features", []):
        attrs = feature.get("attributes")
        obj = make_id(
            data.get("name"),
            attrs.get("guidPart"),
            attrs.get("OBJECTID_1"),
            attrs.get("OBJECTID_12"),
            attrs.get("OBJECTID"),
            attrs.get("ESRI_OID"),
        )
        if obj is None:
            context.log.info("Missing ID: %r", attrs.keys())
        if obj not in seen:
            seen.add(obj)
            attrs["FeatureId"] = obj
            yield attrs
    # The server caps the number of features per query; split the
    # envelope into children and recurse to pick up the remainder.
    if payload.get("exceededTransferLimit"):
        for child in split_envelope(extent):
            for attrs in load_features(context, data, seen, child):
                yield attrs
def parse_entry(context, entry, url, updated_at):
    """Translate one OFAC SDN <entry> XML element into an entity and emit it."""
    uid = entry.findtext('uid')
    entity_type = ENTITY_TYPES[entry.findtext('./sdnType')]
    if entity_type is None:
        # sdnTypes mapped to None are deliberately skipped.
        return
    entity = Entity.create('us-ofac', make_id(url, uid))
    entity.type = entity_type
    entity.updated_at = updated_at
    entity.program = '; '.join(
        p.text for p in entry.findall('./programList/program'))
    entity.summary = entry.findtext('./remarks')
    entity.function = entry.findtext('./title')
    entity.first_name = entry.findtext('./firstName')
    entity.last_name = entry.findtext('./lastName')
    for aka in entry.findall('./akaList/aka'):
        alias = entity.create_alias()
        alias.first_name = aka.findtext('./firstName')
        alias.last_name = aka.findtext('./lastName')
        alias.type = aka.findtext('./type')
        alias.quality = ALIAS_QUALITY[aka.findtext('./category')]
    for ident in entry.findall('./idList/id'):
        id_type = ID_TYPES.get(ident.findtext('./idType'),
                               Identifier.TYPE_OTHER)
        if id_type is None:
            # Explicitly None-mapped ID types are dropped.
            continue
        identifier = entity.create_identifier()
        identifier.type = id_type
        identifier.number = ident.findtext('./idNumber')
        identifier.country = ident.findtext('./idCountry')
        identifier.description = ident.findtext('./idType')
    for addr in entry.findall('./addressList/address'):
        address = entity.create_address()
        address.street = addr.findtext('./address1')
        address.street_2 = addr.findtext('./address2')
        address.city = addr.findtext('./city')
        address.country = addr.findtext('./country')
    for item in entry.findall('./placeOfBirthList/placeOfBirthItem'):
        birth_place = entity.create_birth_place()
        birth_place.place = item.findtext('./placeOfBirth')
        # Only the main entry counts as a strong data point.
        if item.findtext('./mainEntry') == 'true':
            birth_place.quality = BirthPlace.QUALITY_STRONG
        else:
            birth_place.quality = BirthPlace.QUALITY_WEAK
    for item in entry.findall('./dateOfBirthList/dateOfBirthItem'):
        birth_date = entity.create_birth_date()
        birth_date.date = stringify(
            parse_date(item.findtext('./dateOfBirth')))
        if item.findtext('./mainEntry') == 'true':
            birth_date.quality = BirthDate.QUALITY_STRONG
        else:
            birth_date.quality = BirthDate.QUALITY_WEAK
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def scrape_case(context, data):
    """Scrape one Interpol red-notice page and emit the wanted individual.

    Fix: the place-of-birth row was assigned to ``birth_place.date``;
    the value is a place, so it now goes on ``birth_place.place``.
    """
    url = data.get('url')
    res = context.http.get(url)
    doc = res.html
    name = element_text(doc.find('.//div[@class="nom_fugitif_wanted"]'))
    if name is None or name == 'Identity unknown':
        return
    uid = make_id(url)
    entity = Entity.create('interpol-red-notices', uid)
    entity.url = url
    entity.type = entity.TYPE_INDIVIDUAL
    entity.name = name
    entity.program = element_text(doc.find('.//span[@class="nom_fugitif_wanted_small"]'))  # noqa
    # Names come as "LAST, First"; also emit the natural order as an alias.
    if ', ' in name:
        last, first = name.split(', ', 1)
        alias = entity.create_alias()
        alias.name = ' '.join((first, last))
    for row in doc.findall('.//div[@class="bloc_detail"]//tr'):
        title, value = row.findall('./td')
        name = slugify(element_text(title), sep='_')
        value = element_text(value)
        if value is None:
            continue
        if name == 'charges':
            entity.summary = value
        elif name == 'present_family_name':
            entity.last_name = value
        elif name == 'forename':
            entity.first_name = value
        elif name == 'nationality':
            for country in value.split(', '):
                nationality = entity.create_nationality()
                nationality.country = country
        elif name == 'sex':
            entity.gender = SEXES[value]
        elif name == 'date_of_birth':
            birth_date = entity.create_birth_date()
            # Drop any trailing "(age ...)" annotation.
            birth_date.date = value.split('(')[0]
        elif name == 'place_of_birth':
            birth_place = entity.create_birth_place()
            # FIX: was birth_place.date — the value is a place name.
            birth_place.place = value
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def parse_row(context, data):
    """Emit an entity for one row of the US BIS Denied Persons list."""
    row = data.get('row')
    entity = Entity.create(
        'us-bis-denied',
        make_id(row.get('Effective_Date'), row.get('Name')))
    entity.type = Entity.TYPE_ENTITY
    entity.name = row.get('Name')
    entity.updated_at = row.get('Effective_Date')
    entity.program = row.get('FR_Citation')
    entity.summary = row.get('Action')
    address = entity.create_address()
    address.street = row.get('Street_Address')
    address.postal_code = row.get('Postal_Code')
    address.region = row.get('State')
    address.city = row.get('City')
    address.country = row.get('Country')
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def parse_entry(context, entry):
    """Scrape one president's detail page and emit the individual."""
    url = entry.get('href')
    res = context.http.get('https://www.worldpresidentsdb.com/' + url)
    content = res.html.find('.//main/div')
    entity = Entity.create('worldpresidentsdb', make_id(url))
    entity.type = Entity.TYPE_INDIVIDUAL
    entity.function = 'President'
    entity.url = url
    entity.first_name, entity.last_name = \
        content.find('h1').text.split(' ', 1)
    for paragraph in content.findall('.//p'):
        # Each data paragraph starts with a bold label; skip the rest.
        label = paragraph.find('.//b')
        if label is None:
            continue
        label = label.text
        if label == 'Country:':
            nationality = entity.create_nationality()
            nationality.country = paragraph.find('a').text
        elif label == 'Date of Birth:':
            # Dates appear as MM-DD-YYYY; reorder to YYYY-MM-DD.
            value = paragraph[0].tail.strip()
            month, day, year = value.split('-', 2)
            birth_date = entity.create_birth_date()
            birth_date.date = year + '-' + month + '-' + day
            birth_date.quality = 'strong'
        elif label == 'Birth Place:':
            birth_place = entity.create_birth_place()
            birth_place.place = paragraph[0].tail.strip()
        elif label == 'Political Party:':
            entity.program = paragraph[0].tail.strip()
        elif label == 'Other Political Titles:':
            entity.summary = paragraph[0].tail.strip()
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def parse_entity(context, url, country, component, row, updated_at):
    """Emit one world leader from a CIA chiefs-of-state table row."""
    function = element_text(row.find('.//span[@class="title"]'))
    if function is None:
        return
    name = element_text(row.find('.//span[@class="cos_name"]'))
    if name is None:
        return
    entity = Entity.create('us-cia-world-leaders',
                           make_id(country, name, function))
    entity.name = name
    entity.type = entity.TYPE_INDIVIDUAL
    entity.function = function
    entity.program = country
    entity.url = url
    entity.updated_at = updated_at
    nationality = entity.create_nationality()
    nationality.country = country
    # pprint(entity.to_dict())
    context.emit(data=entity.to_dict())
def load_features(context, data, seen, extent):
    """Yield de-duplicated feature attribute dicts for *extent*,
    splitting the envelope and recursing when the server truncates
    the result set."""
    params = QUERY.copy()
    if data['token'] is not None:
        params['token'] = data['token']
    params['geometry'] = json.dumps(extent)
    url = '%s/%s/query' % (data['rest_url'], data['id'])
    res = context.http.get(url, params=params)
    for feature in res.json.get('features', []):
        attrs = feature.get('attributes')
        obj = make_id(data.get('name'),
                      attrs.get('guidPart'),
                      attrs.get('OBJECTID_1'),
                      attrs.get('OBJECTID_12'),
                      attrs.get('OBJECTID'),
                      attrs.get('ESRI_OID'))
        if obj is None:
            context.log.info("Missing ID: %r", attrs.keys())
        if obj not in seen:
            seen.add(obj)
            attrs['FeatureId'] = obj
            yield attrs
    # The server caps the number of features per query; recurse into
    # the envelope's children to pick up the remainder.
    if res.json.get('exceededTransferLimit'):
        for child in split_envelope(extent):
            for attrs in load_features(context, data, seen, child):
                yield attrs